diff --git "a/checkpoint-36000/trainer_state.json" "b/checkpoint-36000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-36000/trainer_state.json" @@ -0,0 +1,252033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.36, + "eval_steps": 500, + "global_step": 36000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1e-05, + "grad_norm": 1.631775788945859, + "learning_rate": 3e-06, + "loss": 10.8658, + "step": 1 + }, + { + "epoch": 2e-05, + "grad_norm": 1.6173222705173065, + "learning_rate": 6e-06, + "loss": 10.8645, + "step": 2 + }, + { + "epoch": 3e-05, + "grad_norm": 1.6387507243802044, + "learning_rate": 9e-06, + "loss": 10.8646, + "step": 3 + }, + { + "epoch": 4e-05, + "grad_norm": 1.5975767011448438, + "learning_rate": 1.2e-05, + "loss": 10.8638, + "step": 4 + }, + { + "epoch": 5e-05, + "grad_norm": 1.6454022013221787, + "learning_rate": 1.5e-05, + "loss": 10.8605, + "step": 5 + }, + { + "epoch": 6e-05, + "grad_norm": 1.6407987097684302, + "learning_rate": 1.8e-05, + "loss": 10.8581, + "step": 6 + }, + { + "epoch": 7e-05, + "grad_norm": 1.609586764602888, + "learning_rate": 2.1000000000000002e-05, + "loss": 10.8444, + "step": 7 + }, + { + "epoch": 8e-05, + "grad_norm": 1.4683048428970586, + "learning_rate": 2.4e-05, + "loss": 10.8173, + "step": 8 + }, + { + "epoch": 9e-05, + "grad_norm": 1.3933728304700357, + "learning_rate": 2.7e-05, + "loss": 10.8102, + "step": 9 + }, + { + "epoch": 0.0001, + "grad_norm": 1.3326098319804733, + "learning_rate": 3e-05, + "loss": 10.7958, + "step": 10 + }, + { + "epoch": 0.00011, + "grad_norm": 1.217237966643813, + "learning_rate": 3.2999999999999996e-05, + "loss": 10.779, + "step": 11 + }, + { + "epoch": 0.00012, + "grad_norm": 1.1764691634458027, + "learning_rate": 3.6e-05, + "loss": 10.7677, + "step": 12 + }, + { + "epoch": 0.00013, + "grad_norm": 1.1304717895604097, + "learning_rate": 3.9e-05, + "loss": 10.7486, + "step": 13 + }, + { + "epoch": 0.00014, + "grad_norm": 1.115888694899127, + "learning_rate": 4.2000000000000004e-05, + "loss": 10.7378, + "step": 14 + }, + { + "epoch": 0.00015, + "grad_norm": 1.1065220670447153, + "learning_rate": 4.4999999999999996e-05, + "loss": 10.7288, + "step": 15 + }, + { + "epoch": 0.00016, + "grad_norm": 1.075226769828573, + "learning_rate": 4.8e-05, + "loss": 10.7115, + "step": 16 + }, + { + "epoch": 0.00017, + "grad_norm": 1.0461893681391048, + "learning_rate": 5.1000000000000006e-05, + "loss": 10.6956, + "step": 17 + }, + { + "epoch": 0.00018, + "grad_norm": 1.0230892582928048, + "learning_rate": 5.4e-05, + "loss": 10.6795, + "step": 18 + }, + { + "epoch": 0.00019, + "grad_norm": 0.9869910790408369, + "learning_rate": 5.7e-05, + "loss": 10.6637, + "step": 19 + }, + { + "epoch": 0.0002, + "grad_norm": 0.9729421058544264, + "learning_rate": 6e-05, + "loss": 10.6515, + "step": 20 + }, + { + "epoch": 0.00021, + "grad_norm": 0.943988636941647, + "learning_rate": 6.3e-05, + "loss": 10.6378, + "step": 21 + }, + { + "epoch": 0.00022, + "grad_norm": 0.9278112602221015, + "learning_rate": 6.599999999999999e-05, + "loss": 10.6233, + "step": 22 + }, + { + "epoch": 0.00023, + "grad_norm": 0.920877936489298, + "learning_rate": 6.9e-05, + "loss": 10.6091, + "step": 23 + }, + { + "epoch": 0.00024, + "grad_norm": 0.9178015901962371, + "learning_rate": 7.2e-05, + "loss": 10.5981, + "step": 24 + }, + { + "epoch": 0.00025, + "grad_norm": 0.9193853191136445, + "learning_rate": 7.500000000000001e-05, + "loss": 10.5835, + "step": 25 + }, + { + "epoch": 0.00026, + "grad_norm": 0.9191743831022944, + "learning_rate": 7.8e-05, + "loss": 10.5705, + "step": 26 + }, + { + "epoch": 0.00027, + "grad_norm": 0.9136913401152261, + "learning_rate": 8.1e-05, + "loss": 10.5585, + "step": 27 + }, + { + "epoch": 0.00028, + "grad_norm": 0.9119853724531574, + "learning_rate": 8.400000000000001e-05, + "loss": 10.5456, + "step": 28 + }, + { + "epoch": 0.00029, + "grad_norm": 0.9130829908837624, + "learning_rate": 8.7e-05, + "loss": 10.5312, + "step": 29 + }, + { + "epoch": 0.0003, + "grad_norm": 0.9186726006674357, + "learning_rate": 8.999999999999999e-05, + "loss": 10.5159, + "step": 30 + }, + { + "epoch": 0.00031, + "grad_norm": 0.9155120967133267, + "learning_rate": 9.3e-05, + "loss": 10.5012, + "step": 31 + }, + { + "epoch": 0.00032, + "grad_norm": 0.9096055134642034, + "learning_rate": 9.6e-05, + "loss": 10.4871, + "step": 32 + }, + { + "epoch": 0.00033, + "grad_norm": 0.91013596598753, + "learning_rate": 9.900000000000001e-05, + "loss": 10.4706, + "step": 33 + }, + { + "epoch": 0.00034, + "grad_norm": 0.9103576711685224, + "learning_rate": 0.00010200000000000001, + "loss": 10.4543, + "step": 34 + }, + { + "epoch": 0.00035, + "grad_norm": 0.9164612613814794, + "learning_rate": 0.00010500000000000002, + "loss": 10.4377, + "step": 35 + }, + { + "epoch": 0.00036, + "grad_norm": 0.9129652960092816, + "learning_rate": 0.000108, + "loss": 10.4202, + "step": 36 + }, + { + "epoch": 0.00037, + "grad_norm": 0.9029365420083331, + "learning_rate": 0.000111, + "loss": 10.4036, + "step": 37 + }, + { + "epoch": 0.00038, + "grad_norm": 0.9075151789728013, + "learning_rate": 0.000114, + "loss": 10.3847, + "step": 38 + }, + { + "epoch": 0.00039, + "grad_norm": 0.9102185085970206, + "learning_rate": 0.000117, + "loss": 10.3654, + "step": 39 + }, + { + "epoch": 0.0004, + "grad_norm": 0.9144897951886523, + "learning_rate": 0.00012, + "loss": 10.3432, + "step": 40 + }, + { + "epoch": 0.00041, + "grad_norm": 0.9044769084629607, + "learning_rate": 0.000123, + "loss": 10.3253, + "step": 41 + }, + { + "epoch": 0.00042, + "grad_norm": 0.9101249031053068, + "learning_rate": 0.000126, + "loss": 10.3047, + "step": 42 + }, + { + "epoch": 0.00043, + "grad_norm": 0.9147428510616606, + "learning_rate": 0.000129, + "loss": 10.282, + "step": 43 + }, + { + "epoch": 0.00044, + "grad_norm": 0.9138907921510238, + "learning_rate": 0.00013199999999999998, + "loss": 10.2606, + "step": 44 + }, + { + "epoch": 0.00045, + "grad_norm": 0.9165726299867081, + "learning_rate": 0.000135, + "loss": 10.2377, + "step": 45 + }, + { + "epoch": 0.00046, + "grad_norm": 0.906013196308877, + "learning_rate": 0.000138, + "loss": 10.216, + "step": 46 + }, + { + "epoch": 0.00047, + "grad_norm": 0.9116233570839986, + "learning_rate": 0.000141, + "loss": 10.1904, + "step": 47 + }, + { + "epoch": 0.00048, + "grad_norm": 0.9086131145608887, + "learning_rate": 0.000144, + "loss": 10.1674, + "step": 48 + }, + { + "epoch": 0.00049, + "grad_norm": 0.9094213201037699, + "learning_rate": 0.000147, + "loss": 10.1435, + "step": 49 + }, + { + "epoch": 0.0005, + "grad_norm": 0.9128293203892458, + "learning_rate": 0.00015000000000000001, + "loss": 10.1173, + "step": 50 + }, + { + "epoch": 0.00051, + "grad_norm": 0.9140261923858894, + "learning_rate": 0.000153, + "loss": 10.0916, + "step": 51 + }, + { + "epoch": 0.00052, + "grad_norm": 0.9121253786029146, + "learning_rate": 0.000156, + "loss": 10.0657, + "step": 52 + }, + { + "epoch": 0.00053, + "grad_norm": 0.9034950978450355, + "learning_rate": 0.000159, + "loss": 10.0413, + "step": 53 + }, + { + "epoch": 0.00054, + "grad_norm": 0.9241206201798855, + "learning_rate": 0.000162, + "loss": 10.0092, + "step": 54 + }, + { + "epoch": 0.00055, + "grad_norm": 0.9149154841379399, + "learning_rate": 0.000165, + "loss": 9.9852, + "step": 55 + }, + { + "epoch": 0.00056, + "grad_norm": 0.9070429364957124, + "learning_rate": 0.00016800000000000002, + "loss": 9.9561, + "step": 56 + }, + { + "epoch": 0.00057, + "grad_norm": 0.9122859536360214, + "learning_rate": 0.000171, + "loss": 9.9285, + "step": 57 + }, + { + "epoch": 0.00058, + "grad_norm": 0.9101615769667563, + "learning_rate": 0.000174, + "loss": 9.9038, + "step": 58 + }, + { + "epoch": 0.00059, + "grad_norm": 0.91462118557662, + "learning_rate": 0.000177, + "loss": 9.872, + "step": 59 + }, + { + "epoch": 0.0006, + "grad_norm": 0.902238261354272, + "learning_rate": 0.00017999999999999998, + "loss": 9.846, + "step": 60 + }, + { + "epoch": 0.00061, + "grad_norm": 0.9010263217287604, + "learning_rate": 0.000183, + "loss": 9.8177, + "step": 61 + }, + { + "epoch": 0.00062, + "grad_norm": 0.9073977759483284, + "learning_rate": 0.000186, + "loss": 9.7857, + "step": 62 + }, + { + "epoch": 0.00063, + "grad_norm": 0.9002035349554564, + "learning_rate": 0.000189, + "loss": 9.7585, + "step": 63 + }, + { + "epoch": 0.00064, + "grad_norm": 0.8964472690991813, + "learning_rate": 0.000192, + "loss": 9.7283, + "step": 64 + }, + { + "epoch": 0.00065, + "grad_norm": 0.9038015098943822, + "learning_rate": 0.00019500000000000002, + "loss": 9.696, + "step": 65 + }, + { + "epoch": 0.00066, + "grad_norm": 0.8969444374465311, + "learning_rate": 0.00019800000000000002, + "loss": 9.6719, + "step": 66 + }, + { + "epoch": 0.00067, + "grad_norm": 0.898249772566312, + "learning_rate": 0.000201, + "loss": 9.642, + "step": 67 + }, + { + "epoch": 0.00068, + "grad_norm": 0.9048900549589218, + "learning_rate": 0.00020400000000000003, + "loss": 9.6091, + "step": 68 + }, + { + "epoch": 0.00069, + "grad_norm": 0.8901553719298861, + "learning_rate": 0.00020700000000000002, + "loss": 9.5816, + "step": 69 + }, + { + "epoch": 0.0007, + "grad_norm": 0.903386863411625, + "learning_rate": 0.00021000000000000004, + "loss": 9.5447, + "step": 70 + }, + { + "epoch": 0.00071, + "grad_norm": 0.8933303377898588, + "learning_rate": 0.00021299999999999997, + "loss": 9.518, + "step": 71 + }, + { + "epoch": 0.00072, + "grad_norm": 0.8969213907091691, + "learning_rate": 0.000216, + "loss": 9.4906, + "step": 72 + }, + { + "epoch": 0.00073, + "grad_norm": 0.8960518018191052, + "learning_rate": 0.00021899999999999998, + "loss": 9.4566, + "step": 73 + }, + { + "epoch": 0.00074, + "grad_norm": 0.8964935065687102, + "learning_rate": 0.000222, + "loss": 9.4297, + "step": 74 + }, + { + "epoch": 0.00075, + "grad_norm": 0.8917043311520961, + "learning_rate": 0.000225, + "loss": 9.3947, + "step": 75 + }, + { + "epoch": 0.00076, + "grad_norm": 0.8997723193205578, + "learning_rate": 0.000228, + "loss": 9.3676, + "step": 76 + }, + { + "epoch": 0.00077, + "grad_norm": 0.8937482133829812, + "learning_rate": 0.000231, + "loss": 9.3393, + "step": 77 + }, + { + "epoch": 0.00078, + "grad_norm": 0.886737139439046, + "learning_rate": 0.000234, + "loss": 9.3091, + "step": 78 + }, + { + "epoch": 0.00079, + "grad_norm": 0.8895258541637481, + "learning_rate": 0.00023700000000000001, + "loss": 9.2733, + "step": 79 + }, + { + "epoch": 0.0008, + "grad_norm": 0.8909958989870267, + "learning_rate": 0.00024, + "loss": 9.2384, + "step": 80 + }, + { + "epoch": 0.00081, + "grad_norm": 0.8966003836256963, + "learning_rate": 0.00024300000000000002, + "loss": 9.2045, + "step": 81 + }, + { + "epoch": 0.00082, + "grad_norm": 0.9055894629552318, + "learning_rate": 0.000246, + "loss": 9.1791, + "step": 82 + }, + { + "epoch": 0.00083, + "grad_norm": 0.8961362604582432, + "learning_rate": 0.00024900000000000004, + "loss": 9.1431, + "step": 83 + }, + { + "epoch": 0.00084, + "grad_norm": 0.8980147614185676, + "learning_rate": 0.000252, + "loss": 9.1116, + "step": 84 + }, + { + "epoch": 0.00085, + "grad_norm": 0.8928314575907911, + "learning_rate": 0.000255, + "loss": 9.0884, + "step": 85 + }, + { + "epoch": 0.00086, + "grad_norm": 0.8950046881177521, + "learning_rate": 0.000258, + "loss": 9.0518, + "step": 86 + }, + { + "epoch": 0.00087, + "grad_norm": 0.8877891352831231, + "learning_rate": 0.000261, + "loss": 9.0297, + "step": 87 + }, + { + "epoch": 0.00088, + "grad_norm": 0.8929677780146621, + "learning_rate": 0.00026399999999999997, + "loss": 8.992, + "step": 88 + }, + { + "epoch": 0.00089, + "grad_norm": 0.8871031960638883, + "learning_rate": 0.000267, + "loss": 8.9663, + "step": 89 + }, + { + "epoch": 0.0009, + "grad_norm": 0.8863614322091727, + "learning_rate": 0.00027, + "loss": 8.9387, + "step": 90 + }, + { + "epoch": 0.00091, + "grad_norm": 0.8806256133072948, + "learning_rate": 0.000273, + "loss": 8.9083, + "step": 91 + }, + { + "epoch": 0.00092, + "grad_norm": 0.8826316050497074, + "learning_rate": 0.000276, + "loss": 8.8757, + "step": 92 + }, + { + "epoch": 0.00093, + "grad_norm": 0.8833603577684496, + "learning_rate": 0.000279, + "loss": 8.8461, + "step": 93 + }, + { + "epoch": 0.00094, + "grad_norm": 0.8819538724809766, + "learning_rate": 0.000282, + "loss": 8.82, + "step": 94 + }, + { + "epoch": 0.00095, + "grad_norm": 0.8776473753829749, + "learning_rate": 0.000285, + "loss": 8.7909, + "step": 95 + }, + { + "epoch": 0.00096, + "grad_norm": 0.8854898687433331, + "learning_rate": 0.000288, + "loss": 8.7608, + "step": 96 + }, + { + "epoch": 0.00097, + "grad_norm": 0.8763526561707659, + "learning_rate": 0.000291, + "loss": 8.7376, + "step": 97 + }, + { + "epoch": 0.00098, + "grad_norm": 0.8773720509513535, + "learning_rate": 0.000294, + "loss": 8.7001, + "step": 98 + }, + { + "epoch": 0.00099, + "grad_norm": 0.8782537783818637, + "learning_rate": 0.000297, + "loss": 8.6785, + "step": 99 + }, + { + "epoch": 0.001, + "grad_norm": 0.876084031734807, + "learning_rate": 0.00030000000000000003, + "loss": 8.6509, + "step": 100 + }, + { + "epoch": 0.00101, + "grad_norm": 0.8775511931302766, + "learning_rate": 0.00030300000000000005, + "loss": 8.6162, + "step": 101 + }, + { + "epoch": 0.00102, + "grad_norm": 0.8666072056658197, + "learning_rate": 0.000306, + "loss": 8.5962, + "step": 102 + }, + { + "epoch": 0.00103, + "grad_norm": 0.8733315070806934, + "learning_rate": 0.000309, + "loss": 8.5716, + "step": 103 + }, + { + "epoch": 0.00104, + "grad_norm": 0.8664419648151436, + "learning_rate": 0.000312, + "loss": 8.5503, + "step": 104 + }, + { + "epoch": 0.00105, + "grad_norm": 0.8699404252946213, + "learning_rate": 0.000315, + "loss": 8.5232, + "step": 105 + }, + { + "epoch": 0.00106, + "grad_norm": 0.8630101619311507, + "learning_rate": 0.000318, + "loss": 8.4946, + "step": 106 + }, + { + "epoch": 0.00107, + "grad_norm": 0.8533006978361278, + "learning_rate": 0.000321, + "loss": 8.4694, + "step": 107 + }, + { + "epoch": 0.00108, + "grad_norm": 0.8571194919918376, + "learning_rate": 0.000324, + "loss": 8.4408, + "step": 108 + }, + { + "epoch": 0.00109, + "grad_norm": 0.8496626885878062, + "learning_rate": 0.000327, + "loss": 8.4213, + "step": 109 + }, + { + "epoch": 0.0011, + "grad_norm": 0.8617458268479945, + "learning_rate": 0.00033, + "loss": 8.3989, + "step": 110 + }, + { + "epoch": 0.00111, + "grad_norm": 0.874730580725934, + "learning_rate": 0.000333, + "loss": 8.3705, + "step": 111 + }, + { + "epoch": 0.00112, + "grad_norm": 0.9211403811176949, + "learning_rate": 0.00033600000000000004, + "loss": 8.351, + "step": 112 + }, + { + "epoch": 0.00113, + "grad_norm": 0.9451163730301329, + "learning_rate": 0.000339, + "loss": 8.3126, + "step": 113 + }, + { + "epoch": 0.00114, + "grad_norm": 0.8518853453666535, + "learning_rate": 0.000342, + "loss": 8.3025, + "step": 114 + }, + { + "epoch": 0.00115, + "grad_norm": 0.8499246309464553, + "learning_rate": 0.00034500000000000004, + "loss": 8.2758, + "step": 115 + }, + { + "epoch": 0.00116, + "grad_norm": 0.8769128820472754, + "learning_rate": 0.000348, + "loss": 8.2536, + "step": 116 + }, + { + "epoch": 0.00117, + "grad_norm": 0.829578266212784, + "learning_rate": 0.000351, + "loss": 8.2211, + "step": 117 + }, + { + "epoch": 0.00118, + "grad_norm": 0.8587574762862499, + "learning_rate": 0.000354, + "loss": 8.2068, + "step": 118 + }, + { + "epoch": 0.00119, + "grad_norm": 0.8383808879241313, + "learning_rate": 0.000357, + "loss": 8.1942, + "step": 119 + }, + { + "epoch": 0.0012, + "grad_norm": 0.8155263912622424, + "learning_rate": 0.00035999999999999997, + "loss": 8.1675, + "step": 120 + }, + { + "epoch": 0.00121, + "grad_norm": 0.8344307084821188, + "learning_rate": 0.000363, + "loss": 8.1409, + "step": 121 + }, + { + "epoch": 0.00122, + "grad_norm": 0.8097993043330719, + "learning_rate": 0.000366, + "loss": 8.1244, + "step": 122 + }, + { + "epoch": 0.00123, + "grad_norm": 0.8029969793277704, + "learning_rate": 0.000369, + "loss": 8.102, + "step": 123 + }, + { + "epoch": 0.00124, + "grad_norm": 0.7829455528112805, + "learning_rate": 0.000372, + "loss": 8.0811, + "step": 124 + }, + { + "epoch": 0.00125, + "grad_norm": 0.8110394816024603, + "learning_rate": 0.000375, + "loss": 8.0581, + "step": 125 + }, + { + "epoch": 0.00126, + "grad_norm": 0.8039928825408066, + "learning_rate": 0.000378, + "loss": 8.0463, + "step": 126 + }, + { + "epoch": 0.00127, + "grad_norm": 0.8369020039958236, + "learning_rate": 0.000381, + "loss": 8.0243, + "step": 127 + }, + { + "epoch": 0.00128, + "grad_norm": 0.9124681746054819, + "learning_rate": 0.000384, + "loss": 8.0062, + "step": 128 + }, + { + "epoch": 0.00129, + "grad_norm": 0.9497790585671452, + "learning_rate": 0.00038700000000000003, + "loss": 7.997, + "step": 129 + }, + { + "epoch": 0.0013, + "grad_norm": 0.895034767303024, + "learning_rate": 0.00039000000000000005, + "loss": 7.9709, + "step": 130 + }, + { + "epoch": 0.00131, + "grad_norm": 0.7344919418773682, + "learning_rate": 0.000393, + "loss": 7.9348, + "step": 131 + }, + { + "epoch": 0.00132, + "grad_norm": 0.8078203789615136, + "learning_rate": 0.00039600000000000003, + "loss": 7.9285, + "step": 132 + }, + { + "epoch": 0.00133, + "grad_norm": 0.8012273324997825, + "learning_rate": 0.00039900000000000005, + "loss": 7.907, + "step": 133 + }, + { + "epoch": 0.00134, + "grad_norm": 0.7227755159346104, + "learning_rate": 0.000402, + "loss": 7.8939, + "step": 134 + }, + { + "epoch": 0.00135, + "grad_norm": 0.7235157354092677, + "learning_rate": 0.00040500000000000003, + "loss": 7.8661, + "step": 135 + }, + { + "epoch": 0.00136, + "grad_norm": 0.7822895950244824, + "learning_rate": 0.00040800000000000005, + "loss": 7.852, + "step": 136 + }, + { + "epoch": 0.00137, + "grad_norm": 0.7608062709985561, + "learning_rate": 0.000411, + "loss": 7.8315, + "step": 137 + }, + { + "epoch": 0.00138, + "grad_norm": 0.7380360973204948, + "learning_rate": 0.00041400000000000003, + "loss": 7.8088, + "step": 138 + }, + { + "epoch": 0.00139, + "grad_norm": 0.684851228306475, + "learning_rate": 0.00041700000000000005, + "loss": 7.7952, + "step": 139 + }, + { + "epoch": 0.0014, + "grad_norm": 0.693462889960616, + "learning_rate": 0.00042000000000000007, + "loss": 7.7794, + "step": 140 + }, + { + "epoch": 0.00141, + "grad_norm": 0.7360428489985282, + "learning_rate": 0.000423, + "loss": 7.7672, + "step": 141 + }, + { + "epoch": 0.00142, + "grad_norm": 0.8088792714181905, + "learning_rate": 0.00042599999999999995, + "loss": 7.7572, + "step": 142 + }, + { + "epoch": 0.00143, + "grad_norm": 0.8184868537412088, + "learning_rate": 0.00042899999999999997, + "loss": 7.7297, + "step": 143 + }, + { + "epoch": 0.00144, + "grad_norm": 0.7328054873489724, + "learning_rate": 0.000432, + "loss": 7.7108, + "step": 144 + }, + { + "epoch": 0.00145, + "grad_norm": 0.742383079085953, + "learning_rate": 0.000435, + "loss": 7.7067, + "step": 145 + }, + { + "epoch": 0.00146, + "grad_norm": 0.8017743593965694, + "learning_rate": 0.00043799999999999997, + "loss": 7.6737, + "step": 146 + }, + { + "epoch": 0.00147, + "grad_norm": 0.7202665590443219, + "learning_rate": 0.000441, + "loss": 7.6628, + "step": 147 + }, + { + "epoch": 0.00148, + "grad_norm": 0.5746342281678257, + "learning_rate": 0.000444, + "loss": 7.6477, + "step": 148 + }, + { + "epoch": 0.00149, + "grad_norm": 0.5929410187247641, + "learning_rate": 0.00044699999999999997, + "loss": 7.6424, + "step": 149 + }, + { + "epoch": 0.0015, + "grad_norm": 0.7151318764270516, + "learning_rate": 0.00045, + "loss": 7.6133, + "step": 150 + }, + { + "epoch": 0.00151, + "grad_norm": 0.6796514647913489, + "learning_rate": 0.000453, + "loss": 7.5947, + "step": 151 + }, + { + "epoch": 0.00152, + "grad_norm": 0.543758313944587, + "learning_rate": 0.000456, + "loss": 7.5852, + "step": 152 + }, + { + "epoch": 0.00153, + "grad_norm": 0.7299692440583161, + "learning_rate": 0.000459, + "loss": 7.5768, + "step": 153 + }, + { + "epoch": 0.00154, + "grad_norm": 0.8631376359970574, + "learning_rate": 0.000462, + "loss": 7.5644, + "step": 154 + }, + { + "epoch": 0.00155, + "grad_norm": 0.9628027146132815, + "learning_rate": 0.000465, + "loss": 7.546, + "step": 155 + }, + { + "epoch": 0.00156, + "grad_norm": 1.6485250632015214, + "learning_rate": 0.000468, + "loss": 7.5336, + "step": 156 + }, + { + "epoch": 0.00157, + "grad_norm": 0.9248857473926935, + "learning_rate": 0.000471, + "loss": 7.5216, + "step": 157 + }, + { + "epoch": 0.00158, + "grad_norm": 0.7465186792591986, + "learning_rate": 0.00047400000000000003, + "loss": 7.4981, + "step": 158 + }, + { + "epoch": 0.00159, + "grad_norm": 0.5902384572619932, + "learning_rate": 0.000477, + "loss": 7.4827, + "step": 159 + }, + { + "epoch": 0.0016, + "grad_norm": 0.799266704152554, + "learning_rate": 0.00048, + "loss": 7.4675, + "step": 160 + }, + { + "epoch": 0.00161, + "grad_norm": 0.7827143911710401, + "learning_rate": 0.00048300000000000003, + "loss": 7.4466, + "step": 161 + }, + { + "epoch": 0.00162, + "grad_norm": 0.9218921339316959, + "learning_rate": 0.00048600000000000005, + "loss": 7.4513, + "step": 162 + }, + { + "epoch": 0.00163, + "grad_norm": 0.7287660540574216, + "learning_rate": 0.0004890000000000001, + "loss": 7.4357, + "step": 163 + }, + { + "epoch": 0.00164, + "grad_norm": 0.5250441215321361, + "learning_rate": 0.000492, + "loss": 7.4302, + "step": 164 + }, + { + "epoch": 0.00165, + "grad_norm": 0.741808200483857, + "learning_rate": 0.000495, + "loss": 7.4077, + "step": 165 + }, + { + "epoch": 0.00166, + "grad_norm": 0.6759515654254841, + "learning_rate": 0.0004980000000000001, + "loss": 7.3847, + "step": 166 + }, + { + "epoch": 0.00167, + "grad_norm": 0.5877266102377413, + "learning_rate": 0.000501, + "loss": 7.373, + "step": 167 + }, + { + "epoch": 0.00168, + "grad_norm": 0.571053515725499, + "learning_rate": 0.000504, + "loss": 7.3498, + "step": 168 + }, + { + "epoch": 0.00169, + "grad_norm": 0.6044099115482432, + "learning_rate": 0.0005070000000000001, + "loss": 7.3473, + "step": 169 + }, + { + "epoch": 0.0017, + "grad_norm": 0.46081682030640647, + "learning_rate": 0.00051, + "loss": 7.3262, + "step": 170 + }, + { + "epoch": 0.00171, + "grad_norm": 0.6775750897944629, + "learning_rate": 0.000513, + "loss": 7.3116, + "step": 171 + }, + { + "epoch": 0.00172, + "grad_norm": 0.57210862548929, + "learning_rate": 0.000516, + "loss": 7.3117, + "step": 172 + }, + { + "epoch": 0.00173, + "grad_norm": 0.5300582190464731, + "learning_rate": 0.0005189999999999999, + "loss": 7.2934, + "step": 173 + }, + { + "epoch": 0.00174, + "grad_norm": 0.7575900839335431, + "learning_rate": 0.000522, + "loss": 7.3114, + "step": 174 + }, + { + "epoch": 0.00175, + "grad_norm": 0.7613961222663432, + "learning_rate": 0.000525, + "loss": 7.2747, + "step": 175 + }, + { + "epoch": 0.00176, + "grad_norm": 0.6765759081090318, + "learning_rate": 0.0005279999999999999, + "loss": 7.2505, + "step": 176 + }, + { + "epoch": 0.00177, + "grad_norm": 0.8933831251648804, + "learning_rate": 0.000531, + "loss": 7.2576, + "step": 177 + }, + { + "epoch": 0.00178, + "grad_norm": 0.7076429855739662, + "learning_rate": 0.000534, + "loss": 7.2471, + "step": 178 + }, + { + "epoch": 0.00179, + "grad_norm": 0.49163795680938555, + "learning_rate": 0.000537, + "loss": 7.2319, + "step": 179 + }, + { + "epoch": 0.0018, + "grad_norm": 0.6284573091622804, + "learning_rate": 0.00054, + "loss": 7.2064, + "step": 180 + }, + { + "epoch": 0.00181, + "grad_norm": 0.5715580305485367, + "learning_rate": 0.000543, + "loss": 7.1941, + "step": 181 + }, + { + "epoch": 0.00182, + "grad_norm": 0.4282299340658738, + "learning_rate": 0.000546, + "loss": 7.1864, + "step": 182 + }, + { + "epoch": 0.00183, + "grad_norm": 0.5948913741099119, + "learning_rate": 0.000549, + "loss": 7.1781, + "step": 183 + }, + { + "epoch": 0.00184, + "grad_norm": 0.4755161712268706, + "learning_rate": 0.000552, + "loss": 7.1637, + "step": 184 + }, + { + "epoch": 0.00185, + "grad_norm": 0.46412280386502286, + "learning_rate": 0.000555, + "loss": 7.14, + "step": 185 + }, + { + "epoch": 0.00186, + "grad_norm": 0.526893652843914, + "learning_rate": 0.000558, + "loss": 7.1495, + "step": 186 + }, + { + "epoch": 0.00187, + "grad_norm": 0.46987960377579885, + "learning_rate": 0.000561, + "loss": 7.1236, + "step": 187 + }, + { + "epoch": 0.00188, + "grad_norm": 0.41993376006980737, + "learning_rate": 0.000564, + "loss": 7.1004, + "step": 188 + }, + { + "epoch": 0.00189, + "grad_norm": 0.5433001024887105, + "learning_rate": 0.000567, + "loss": 7.0986, + "step": 189 + }, + { + "epoch": 0.0019, + "grad_norm": 0.42284758794841465, + "learning_rate": 0.00057, + "loss": 7.0796, + "step": 190 + }, + { + "epoch": 0.00191, + "grad_norm": 0.47158596225286625, + "learning_rate": 0.000573, + "loss": 7.0701, + "step": 191 + }, + { + "epoch": 0.00192, + "grad_norm": 0.4590137866807191, + "learning_rate": 0.000576, + "loss": 7.0611, + "step": 192 + }, + { + "epoch": 0.00193, + "grad_norm": 0.43849327433792495, + "learning_rate": 0.000579, + "loss": 7.0612, + "step": 193 + }, + { + "epoch": 0.00194, + "grad_norm": 0.4326532382468588, + "learning_rate": 0.000582, + "loss": 7.0527, + "step": 194 + }, + { + "epoch": 0.00195, + "grad_norm": 0.508356146068285, + "learning_rate": 0.000585, + "loss": 7.0348, + "step": 195 + }, + { + "epoch": 0.00196, + "grad_norm": 0.534972126993015, + "learning_rate": 0.000588, + "loss": 7.0227, + "step": 196 + }, + { + "epoch": 0.00197, + "grad_norm": 0.693386583628206, + "learning_rate": 0.000591, + "loss": 7.0198, + "step": 197 + }, + { + "epoch": 0.00198, + "grad_norm": 1.4796610947894584, + "learning_rate": 0.000594, + "loss": 7.0311, + "step": 198 + }, + { + "epoch": 0.00199, + "grad_norm": 0.8596271717690976, + "learning_rate": 0.0005970000000000001, + "loss": 7.0019, + "step": 199 + }, + { + "epoch": 0.002, + "grad_norm": 0.5642900417004818, + "learning_rate": 0.0006000000000000001, + "loss": 6.991, + "step": 200 + }, + { + "epoch": 0.00201, + "grad_norm": 0.8692961874823039, + "learning_rate": 0.000603, + "loss": 6.9753, + "step": 201 + }, + { + "epoch": 0.00202, + "grad_norm": 0.6679575424248658, + "learning_rate": 0.0006060000000000001, + "loss": 6.9579, + "step": 202 + }, + { + "epoch": 0.00203, + "grad_norm": 0.8457647455673973, + "learning_rate": 0.0006090000000000001, + "loss": 6.9683, + "step": 203 + }, + { + "epoch": 0.00204, + "grad_norm": 0.9450482633193271, + "learning_rate": 0.000612, + "loss": 6.9556, + "step": 204 + }, + { + "epoch": 0.00205, + "grad_norm": 1.3090292735766231, + "learning_rate": 0.000615, + "loss": 6.9523, + "step": 205 + }, + { + "epoch": 0.00206, + "grad_norm": 0.6638095483572409, + "learning_rate": 0.000618, + "loss": 6.9303, + "step": 206 + }, + { + "epoch": 0.00207, + "grad_norm": 0.4840297186697793, + "learning_rate": 0.000621, + "loss": 6.9174, + "step": 207 + }, + { + "epoch": 0.00208, + "grad_norm": 0.7834386214905589, + "learning_rate": 0.000624, + "loss": 6.9192, + "step": 208 + }, + { + "epoch": 0.00209, + "grad_norm": 0.69675327719343, + "learning_rate": 0.000627, + "loss": 6.9018, + "step": 209 + }, + { + "epoch": 0.0021, + "grad_norm": 0.5517351762495105, + "learning_rate": 0.00063, + "loss": 6.8834, + "step": 210 + }, + { + "epoch": 0.00211, + "grad_norm": 0.6866030941755482, + "learning_rate": 0.000633, + "loss": 6.8831, + "step": 211 + }, + { + "epoch": 0.00212, + "grad_norm": 0.4815850149383259, + "learning_rate": 0.000636, + "loss": 6.8711, + "step": 212 + }, + { + "epoch": 0.00213, + "grad_norm": 0.5445114180165818, + "learning_rate": 0.000639, + "loss": 6.8513, + "step": 213 + }, + { + "epoch": 0.00214, + "grad_norm": 0.5893761744890885, + "learning_rate": 0.000642, + "loss": 6.8511, + "step": 214 + }, + { + "epoch": 0.00215, + "grad_norm": 0.3989068076407255, + "learning_rate": 0.000645, + "loss": 6.8428, + "step": 215 + }, + { + "epoch": 0.00216, + "grad_norm": 0.4674719209271809, + "learning_rate": 0.000648, + "loss": 6.8374, + "step": 216 + }, + { + "epoch": 0.00217, + "grad_norm": 0.556887374004828, + "learning_rate": 0.000651, + "loss": 6.8123, + "step": 217 + }, + { + "epoch": 0.00218, + "grad_norm": 0.5348764881519483, + "learning_rate": 0.000654, + "loss": 6.815, + "step": 218 + }, + { + "epoch": 0.00219, + "grad_norm": 0.6261358280484484, + "learning_rate": 0.000657, + "loss": 6.7906, + "step": 219 + }, + { + "epoch": 0.0022, + "grad_norm": 0.5490386646627615, + "learning_rate": 0.00066, + "loss": 6.8058, + "step": 220 + }, + { + "epoch": 0.00221, + "grad_norm": 0.5297655781082383, + "learning_rate": 0.0006630000000000001, + "loss": 6.7838, + "step": 221 + }, + { + "epoch": 0.00222, + "grad_norm": 0.53116953133404, + "learning_rate": 0.000666, + "loss": 6.7711, + "step": 222 + }, + { + "epoch": 0.00223, + "grad_norm": 0.46985855580572156, + "learning_rate": 0.000669, + "loss": 6.7662, + "step": 223 + }, + { + "epoch": 0.00224, + "grad_norm": 0.48892819667849163, + "learning_rate": 0.0006720000000000001, + "loss": 6.7615, + "step": 224 + }, + { + "epoch": 0.00225, + "grad_norm": 0.5426443115029689, + "learning_rate": 0.000675, + "loss": 6.7505, + "step": 225 + }, + { + "epoch": 0.00226, + "grad_norm": 0.47341745143430014, + "learning_rate": 0.000678, + "loss": 6.7457, + "step": 226 + }, + { + "epoch": 0.00227, + "grad_norm": 0.47753897999990824, + "learning_rate": 0.0006810000000000001, + "loss": 6.7186, + "step": 227 + }, + { + "epoch": 0.00228, + "grad_norm": 0.43835516232945165, + "learning_rate": 0.000684, + "loss": 6.721, + "step": 228 + }, + { + "epoch": 0.00229, + "grad_norm": 0.3666587821660354, + "learning_rate": 0.000687, + "loss": 6.7162, + "step": 229 + }, + { + "epoch": 0.0023, + "grad_norm": 0.5954344273025705, + "learning_rate": 0.0006900000000000001, + "loss": 6.6971, + "step": 230 + }, + { + "epoch": 0.00231, + "grad_norm": 0.8324250780860898, + "learning_rate": 0.000693, + "loss": 6.6912, + "step": 231 + }, + { + "epoch": 0.00232, + "grad_norm": 1.1082992895496584, + "learning_rate": 0.000696, + "loss": 6.7117, + "step": 232 + }, + { + "epoch": 0.00233, + "grad_norm": 0.8989391942429391, + "learning_rate": 0.0006990000000000001, + "loss": 6.6931, + "step": 233 + }, + { + "epoch": 0.00234, + "grad_norm": 0.8501087453831264, + "learning_rate": 0.000702, + "loss": 6.6816, + "step": 234 + }, + { + "epoch": 0.00235, + "grad_norm": 0.9709457331919181, + "learning_rate": 0.000705, + "loss": 6.6715, + "step": 235 + }, + { + "epoch": 0.00236, + "grad_norm": 0.8996312948341649, + "learning_rate": 0.000708, + "loss": 6.6542, + "step": 236 + }, + { + "epoch": 0.00237, + "grad_norm": 0.7941572817187773, + "learning_rate": 0.0007109999999999999, + "loss": 6.6634, + "step": 237 + }, + { + "epoch": 0.00238, + "grad_norm": 0.649710293154646, + "learning_rate": 0.000714, + "loss": 6.6483, + "step": 238 + }, + { + "epoch": 0.00239, + "grad_norm": 0.7175873388046764, + "learning_rate": 0.000717, + "loss": 6.6317, + "step": 239 + }, + { + "epoch": 0.0024, + "grad_norm": 1.0726439429102004, + "learning_rate": 0.0007199999999999999, + "loss": 6.6535, + "step": 240 + }, + { + "epoch": 0.00241, + "grad_norm": 1.1551390926973517, + "learning_rate": 0.000723, + "loss": 6.6508, + "step": 241 + }, + { + "epoch": 0.00242, + "grad_norm": 0.8245355038796127, + "learning_rate": 0.000726, + "loss": 6.615, + "step": 242 + }, + { + "epoch": 0.00243, + "grad_norm": 0.7119399485811939, + "learning_rate": 0.000729, + "loss": 6.6026, + "step": 243 + }, + { + "epoch": 0.00244, + "grad_norm": 0.6396700306701443, + "learning_rate": 0.000732, + "loss": 6.6042, + "step": 244 + }, + { + "epoch": 0.00245, + "grad_norm": 0.668492143187707, + "learning_rate": 0.000735, + "loss": 6.5953, + "step": 245 + }, + { + "epoch": 0.00246, + "grad_norm": 0.6209038847600604, + "learning_rate": 0.000738, + "loss": 6.5759, + "step": 246 + }, + { + "epoch": 0.00247, + "grad_norm": 0.49470830317055475, + "learning_rate": 0.000741, + "loss": 6.5677, + "step": 247 + }, + { + "epoch": 0.00248, + "grad_norm": 0.5745764116827149, + "learning_rate": 0.000744, + "loss": 6.5775, + "step": 248 + }, + { + "epoch": 0.00249, + "grad_norm": 0.5319509172858093, + "learning_rate": 0.000747, + "loss": 6.5558, + "step": 249 + }, + { + "epoch": 0.0025, + "grad_norm": 0.482084360804442, + "learning_rate": 0.00075, + "loss": 6.556, + "step": 250 + }, + { + "epoch": 0.00251, + "grad_norm": 0.46516739275647623, + "learning_rate": 0.000753, + "loss": 6.521, + "step": 251 + }, + { + "epoch": 0.00252, + "grad_norm": 0.4629119115625355, + "learning_rate": 0.000756, + "loss": 6.531, + "step": 252 + }, + { + "epoch": 0.00253, + "grad_norm": 0.37719629506333596, + "learning_rate": 0.000759, + "loss": 6.519, + "step": 253 + }, + { + "epoch": 0.00254, + "grad_norm": 0.44323602664762185, + "learning_rate": 0.000762, + "loss": 6.5149, + "step": 254 + }, + { + "epoch": 0.00255, + "grad_norm": 0.38153047495099895, + "learning_rate": 0.0007650000000000001, + "loss": 6.5129, + "step": 255 + }, + { + "epoch": 0.00256, + "grad_norm": 0.5270908471121704, + "learning_rate": 0.000768, + "loss": 6.4934, + "step": 256 + }, + { + "epoch": 0.00257, + "grad_norm": 0.6201344591076082, + "learning_rate": 0.000771, + "loss": 6.4997, + "step": 257 + }, + { + "epoch": 0.00258, + "grad_norm": 0.6391276132887356, + "learning_rate": 0.0007740000000000001, + "loss": 6.477, + "step": 258 + }, + { + "epoch": 0.00259, + "grad_norm": 0.6374758421191778, + "learning_rate": 0.000777, + "loss": 6.4787, + "step": 259 + }, + { + "epoch": 0.0026, + "grad_norm": 0.5597091224464362, + "learning_rate": 0.0007800000000000001, + "loss": 6.4607, + "step": 260 + }, + { + "epoch": 0.00261, + "grad_norm": 0.587169694241395, + "learning_rate": 0.0007830000000000001, + "loss": 6.4722, + "step": 261 + }, + { + "epoch": 0.00262, + "grad_norm": 0.6112267949829847, + "learning_rate": 0.000786, + "loss": 6.4511, + "step": 262 + }, + { + "epoch": 0.00263, + "grad_norm": 0.5933922824160996, + "learning_rate": 0.0007890000000000001, + "loss": 6.4574, + "step": 263 + }, + { + "epoch": 0.00264, + "grad_norm": 0.6560299493456899, + "learning_rate": 0.0007920000000000001, + "loss": 6.4408, + "step": 264 + }, + { + "epoch": 0.00265, + "grad_norm": 0.9913628812090025, + "learning_rate": 0.000795, + "loss": 6.4422, + "step": 265 + }, + { + "epoch": 0.00266, + "grad_norm": 1.340981155098937, + "learning_rate": 0.0007980000000000001, + "loss": 6.4533, + "step": 266 + }, + { + "epoch": 0.00267, + "grad_norm": 0.8266116512325479, + "learning_rate": 0.0008010000000000001, + "loss": 6.4319, + "step": 267 + }, + { + "epoch": 0.00268, + "grad_norm": 0.9896228951890642, + "learning_rate": 0.000804, + "loss": 6.4378, + "step": 268 + }, + { + "epoch": 0.00269, + "grad_norm": 1.2352739008881923, + "learning_rate": 0.0008070000000000001, + "loss": 6.4279, + "step": 269 + }, + { + "epoch": 0.0027, + "grad_norm": 1.1652427209458782, + "learning_rate": 0.0008100000000000001, + "loss": 6.4326, + "step": 270 + }, + { + "epoch": 0.00271, + "grad_norm": 1.0407181933539849, + "learning_rate": 0.000813, + "loss": 6.4319, + "step": 271 + }, + { + "epoch": 0.00272, + "grad_norm": 0.8880696455452757, + "learning_rate": 0.0008160000000000001, + "loss": 6.4138, + "step": 272 + }, + { + "epoch": 0.00273, + "grad_norm": 0.8477724135782442, + "learning_rate": 0.0008190000000000001, + "loss": 6.404, + "step": 273 + }, + { + "epoch": 0.00274, + "grad_norm": 0.7818547901656048, + "learning_rate": 0.000822, + "loss": 6.383, + "step": 274 + }, + { + "epoch": 0.00275, + "grad_norm": 0.6915610404761925, + "learning_rate": 0.0008250000000000001, + "loss": 6.3888, + "step": 275 + }, + { + "epoch": 0.00276, + "grad_norm": 0.6369714396426732, + "learning_rate": 0.0008280000000000001, + "loss": 6.3775, + "step": 276 + }, + { + "epoch": 0.00277, + "grad_norm": 0.6792843462530734, + "learning_rate": 0.0008310000000000001, + "loss": 6.3726, + "step": 277 + }, + { + "epoch": 0.00278, + "grad_norm": 0.6716653191335978, + "learning_rate": 0.0008340000000000001, + "loss": 6.3561, + "step": 278 + }, + { + "epoch": 0.00279, + "grad_norm": 0.6104270551210891, + "learning_rate": 0.0008370000000000001, + "loss": 6.3562, + "step": 279 + }, + { + "epoch": 0.0028, + "grad_norm": 0.5327216367370322, + "learning_rate": 0.0008400000000000001, + "loss": 6.3379, + "step": 280 + }, + { + "epoch": 0.00281, + "grad_norm": 0.4495801132850456, + "learning_rate": 0.0008430000000000001, + "loss": 6.3253, + "step": 281 + }, + { + "epoch": 0.00282, + "grad_norm": 0.4185635012011635, + "learning_rate": 0.000846, + "loss": 6.3254, + "step": 282 + }, + { + "epoch": 0.00283, + "grad_norm": 0.41306707794715253, + "learning_rate": 0.0008489999999999999, + "loss": 6.3154, + "step": 283 + }, + { + "epoch": 0.00284, + "grad_norm": 0.447351018324713, + "learning_rate": 0.0008519999999999999, + "loss": 6.3075, + "step": 284 + }, + { + "epoch": 0.00285, + "grad_norm": 0.4656314656211844, + "learning_rate": 0.000855, + "loss": 6.3102, + "step": 285 + }, + { + "epoch": 0.00286, + "grad_norm": 0.5287748664566101, + "learning_rate": 0.0008579999999999999, + "loss": 6.2881, + "step": 286 + }, + { + "epoch": 0.00287, + "grad_norm": 0.60454227039484, + "learning_rate": 0.000861, + "loss": 6.2937, + "step": 287 + }, + { + "epoch": 0.00288, + "grad_norm": 0.6409086244349441, + "learning_rate": 0.000864, + "loss": 6.2743, + "step": 288 + }, + { + "epoch": 0.00289, + "grad_norm": 0.7540915605033448, + "learning_rate": 0.0008669999999999999, + "loss": 6.2909, + "step": 289 + }, + { + "epoch": 0.0029, + "grad_norm": 0.9532532853232767, + "learning_rate": 0.00087, + "loss": 6.2777, + "step": 290 + }, + { + "epoch": 0.00291, + "grad_norm": 0.9297633606905631, + "learning_rate": 0.000873, + "loss": 6.278, + "step": 291 + }, + { + "epoch": 0.00292, + "grad_norm": 0.6544361038243887, + "learning_rate": 0.0008759999999999999, + "loss": 6.2635, + "step": 292 + }, + { + "epoch": 0.00293, + "grad_norm": 0.9586546136156446, + "learning_rate": 0.000879, + "loss": 6.2582, + "step": 293 + }, + { + "epoch": 0.00294, + "grad_norm": 0.8674924960686783, + "learning_rate": 0.000882, + "loss": 6.2684, + "step": 294 + }, + { + "epoch": 0.00295, + "grad_norm": 0.8596325280201164, + "learning_rate": 0.0008849999999999999, + "loss": 6.2363, + "step": 295 + }, + { + "epoch": 0.00296, + "grad_norm": 0.9927641151458286, + "learning_rate": 0.000888, + "loss": 6.2541, + "step": 296 + }, + { + "epoch": 0.00297, + "grad_norm": 1.342485766358639, + "learning_rate": 0.000891, + "loss": 6.2408, + "step": 297 + }, + { + "epoch": 0.00298, + "grad_norm": 1.1878316085061287, + "learning_rate": 0.0008939999999999999, + "loss": 6.2537, + "step": 298 + }, + { + "epoch": 0.00299, + "grad_norm": 0.9496422749623654, + "learning_rate": 0.000897, + "loss": 6.2242, + "step": 299 + }, + { + "epoch": 0.003, + "grad_norm": 1.122193003605518, + "learning_rate": 0.0009, + "loss": 6.2361, + "step": 300 + }, + { + "epoch": 0.00301, + "grad_norm": 1.129970594986655, + "learning_rate": 0.0009029999999999999, + "loss": 6.2273, + "step": 301 + }, + { + "epoch": 0.00302, + "grad_norm": 1.0740447263196922, + "learning_rate": 0.000906, + "loss": 6.2071, + "step": 302 + }, + { + "epoch": 0.00303, + "grad_norm": 1.1900410452977912, + "learning_rate": 0.000909, + "loss": 6.2313, + "step": 303 + }, + { + "epoch": 0.00304, + "grad_norm": 0.804691464481299, + "learning_rate": 0.000912, + "loss": 6.2111, + "step": 304 + }, + { + "epoch": 0.00305, + "grad_norm": 0.7167209084416579, + "learning_rate": 0.000915, + "loss": 6.2106, + "step": 305 + }, + { + "epoch": 0.00306, + "grad_norm": 0.5686498260282739, + "learning_rate": 0.000918, + "loss": 6.1897, + "step": 306 + }, + { + "epoch": 0.00307, + "grad_norm": 0.5740516870647188, + "learning_rate": 0.000921, + "loss": 6.1847, + "step": 307 + }, + { + "epoch": 0.00308, + "grad_norm": 0.5214022662741855, + "learning_rate": 0.000924, + "loss": 6.1668, + "step": 308 + }, + { + "epoch": 0.00309, + "grad_norm": 0.489157506496739, + "learning_rate": 0.000927, + "loss": 6.1798, + "step": 309 + }, + { + "epoch": 0.0031, + "grad_norm": 0.4872945232166538, + "learning_rate": 0.00093, + "loss": 6.1622, + "step": 310 + }, + { + "epoch": 0.00311, + "grad_norm": 0.4909949625440354, + "learning_rate": 0.000933, + "loss": 6.1533, + "step": 311 + }, + { + "epoch": 0.00312, + "grad_norm": 0.4186129744309998, + "learning_rate": 0.000936, + "loss": 6.1314, + "step": 312 + }, + { + "epoch": 0.00313, + "grad_norm": 0.36050967020968366, + "learning_rate": 0.0009390000000000001, + "loss": 6.1442, + "step": 313 + }, + { + "epoch": 0.00314, + "grad_norm": 0.3818285660239077, + "learning_rate": 0.000942, + "loss": 6.1495, + "step": 314 + }, + { + "epoch": 0.00315, + "grad_norm": 0.42967169925093956, + "learning_rate": 0.000945, + "loss": 6.126, + "step": 315 + }, + { + "epoch": 0.00316, + "grad_norm": 0.46511434454587514, + "learning_rate": 0.0009480000000000001, + "loss": 6.1004, + "step": 316 + }, + { + "epoch": 0.00317, + "grad_norm": 0.5237888199450732, + "learning_rate": 0.000951, + "loss": 6.1201, + "step": 317 + }, + { + "epoch": 0.00318, + "grad_norm": 0.6610672935792641, + "learning_rate": 0.000954, + "loss": 6.1161, + "step": 318 + }, + { + "epoch": 0.00319, + "grad_norm": 0.9099491192879063, + "learning_rate": 0.0009570000000000001, + "loss": 6.1122, + "step": 319 + }, + { + "epoch": 0.0032, + "grad_norm": 1.0329714723925014, + "learning_rate": 0.00096, + "loss": 6.1198, + "step": 320 + }, + { + "epoch": 0.00321, + "grad_norm": 0.9944509511152075, + "learning_rate": 0.000963, + "loss": 6.1077, + "step": 321 + }, + { + "epoch": 0.00322, + "grad_norm": 1.41191394849347, + "learning_rate": 0.0009660000000000001, + "loss": 6.1101, + "step": 322 + }, + { + "epoch": 0.00323, + "grad_norm": 0.9172707652477707, + "learning_rate": 0.000969, + "loss": 6.097, + "step": 323 + }, + { + "epoch": 0.00324, + "grad_norm": 1.1261423941310122, + "learning_rate": 0.0009720000000000001, + "loss": 6.1132, + "step": 324 + }, + { + "epoch": 0.00325, + "grad_norm": 0.6623590740718236, + "learning_rate": 0.0009750000000000001, + "loss": 6.0626, + "step": 325 + }, + { + "epoch": 0.00326, + "grad_norm": 0.7364273563271467, + "learning_rate": 0.0009780000000000001, + "loss": 6.0809, + "step": 326 + }, + { + "epoch": 0.00327, + "grad_norm": 0.8106867198528734, + "learning_rate": 0.000981, + "loss": 6.0853, + "step": 327 + }, + { + "epoch": 0.00328, + "grad_norm": 0.9008187294951384, + "learning_rate": 0.000984, + "loss": 6.0637, + "step": 328 + }, + { + "epoch": 0.00329, + "grad_norm": 1.0311811602663732, + "learning_rate": 0.000987, + "loss": 6.0736, + "step": 329 + }, + { + "epoch": 0.0033, + "grad_norm": 0.7413155368855245, + "learning_rate": 0.00099, + "loss": 6.0572, + "step": 330 + }, + { + "epoch": 0.00331, + "grad_norm": 0.6745658849207387, + "learning_rate": 0.0009930000000000002, + "loss": 6.0599, + "step": 331 + }, + { + "epoch": 0.00332, + "grad_norm": 0.5913240343902441, + "learning_rate": 0.0009960000000000001, + "loss": 6.041, + "step": 332 + }, + { + "epoch": 0.00333, + "grad_norm": 0.5668749800176679, + "learning_rate": 0.000999, + "loss": 6.025, + "step": 333 + }, + { + "epoch": 0.00334, + "grad_norm": 0.5007608052342689, + "learning_rate": 0.001002, + "loss": 6.0336, + "step": 334 + }, + { + "epoch": 0.00335, + "grad_norm": 0.3983861566645405, + "learning_rate": 0.001005, + "loss": 6.0284, + "step": 335 + }, + { + "epoch": 0.00336, + "grad_norm": 0.4274260388302738, + "learning_rate": 0.001008, + "loss": 6.0181, + "step": 336 + }, + { + "epoch": 0.00337, + "grad_norm": 0.5335498119421307, + "learning_rate": 0.0010110000000000002, + "loss": 6.0004, + "step": 337 + }, + { + "epoch": 0.00338, + "grad_norm": 0.4831054617031532, + "learning_rate": 0.0010140000000000001, + "loss": 6.0126, + "step": 338 + }, + { + "epoch": 0.00339, + "grad_norm": 0.5437142240239534, + "learning_rate": 0.0010170000000000001, + "loss": 6.0197, + "step": 339 + }, + { + "epoch": 0.0034, + "grad_norm": 0.5265472037464025, + "learning_rate": 0.00102, + "loss": 5.9884, + "step": 340 + }, + { + "epoch": 0.00341, + "grad_norm": 0.4912689865797111, + "learning_rate": 0.001023, + "loss": 5.9692, + "step": 341 + }, + { + "epoch": 0.00342, + "grad_norm": 0.42997171211054086, + "learning_rate": 0.001026, + "loss": 5.974, + "step": 342 + }, + { + "epoch": 0.00343, + "grad_norm": 0.5197303002983154, + "learning_rate": 0.0010290000000000002, + "loss": 5.9763, + "step": 343 + }, + { + "epoch": 0.00344, + "grad_norm": 0.8117900330313431, + "learning_rate": 0.001032, + "loss": 5.9747, + "step": 344 + }, + { + "epoch": 0.00345, + "grad_norm": 1.1753813945983669, + "learning_rate": 0.001035, + "loss": 5.9788, + "step": 345 + }, + { + "epoch": 0.00346, + "grad_norm": 0.8135676081857764, + "learning_rate": 0.0010379999999999999, + "loss": 5.9748, + "step": 346 + }, + { + "epoch": 0.00347, + "grad_norm": 1.0182912247404574, + "learning_rate": 0.001041, + "loss": 5.9557, + "step": 347 + }, + { + "epoch": 0.00348, + "grad_norm": 1.1407576555241683, + "learning_rate": 0.001044, + "loss": 5.978, + "step": 348 + }, + { + "epoch": 0.00349, + "grad_norm": 0.7853068136249622, + "learning_rate": 0.001047, + "loss": 5.9412, + "step": 349 + }, + { + "epoch": 0.0035, + "grad_norm": 1.0427704318540805, + "learning_rate": 0.00105, + "loss": 5.9779, + "step": 350 + }, + { + "epoch": 0.00351, + "grad_norm": 0.8821399606009466, + "learning_rate": 0.001053, + "loss": 5.9701, + "step": 351 + }, + { + "epoch": 0.00352, + "grad_norm": 0.9582157894617032, + "learning_rate": 0.0010559999999999999, + "loss": 5.955, + "step": 352 + }, + { + "epoch": 0.00353, + "grad_norm": 1.0526665256553966, + "learning_rate": 0.001059, + "loss": 5.958, + "step": 353 + }, + { + "epoch": 0.00354, + "grad_norm": 1.045275747166985, + "learning_rate": 0.001062, + "loss": 5.9353, + "step": 354 + }, + { + "epoch": 0.00355, + "grad_norm": 1.1505195376317356, + "learning_rate": 0.001065, + "loss": 5.9542, + "step": 355 + }, + { + "epoch": 0.00356, + "grad_norm": 1.0355197503433216, + "learning_rate": 0.001068, + "loss": 5.9425, + "step": 356 + }, + { + "epoch": 0.00357, + "grad_norm": 1.127934795973434, + "learning_rate": 0.001071, + "loss": 5.9396, + "step": 357 + }, + { + "epoch": 0.00358, + "grad_norm": 0.7430425972675007, + "learning_rate": 0.001074, + "loss": 5.9201, + "step": 358 + }, + { + "epoch": 0.00359, + "grad_norm": 0.6597065121039739, + "learning_rate": 0.001077, + "loss": 5.9099, + "step": 359 + }, + { + "epoch": 0.0036, + "grad_norm": 0.6034653307534226, + "learning_rate": 0.00108, + "loss": 5.9081, + "step": 360 + }, + { + "epoch": 0.00361, + "grad_norm": 0.5960018023982208, + "learning_rate": 0.001083, + "loss": 5.9047, + "step": 361 + }, + { + "epoch": 0.00362, + "grad_norm": 0.4563634882449727, + "learning_rate": 0.001086, + "loss": 5.884, + "step": 362 + }, + { + "epoch": 0.00363, + "grad_norm": 0.49274399902142996, + "learning_rate": 0.001089, + "loss": 5.8759, + "step": 363 + }, + { + "epoch": 0.00364, + "grad_norm": 0.4937234603270663, + "learning_rate": 0.001092, + "loss": 5.8901, + "step": 364 + }, + { + "epoch": 0.00365, + "grad_norm": 0.5102012627619638, + "learning_rate": 0.001095, + "loss": 5.888, + "step": 365 + }, + { + "epoch": 0.00366, + "grad_norm": 0.4676595798467989, + "learning_rate": 0.001098, + "loss": 5.862, + "step": 366 + }, + { + "epoch": 0.00367, + "grad_norm": 0.49526135096535867, + "learning_rate": 0.001101, + "loss": 5.8667, + "step": 367 + }, + { + "epoch": 0.00368, + "grad_norm": 0.47887378181150303, + "learning_rate": 0.001104, + "loss": 5.8643, + "step": 368 + }, + { + "epoch": 0.00369, + "grad_norm": 0.48887117156741833, + "learning_rate": 0.001107, + "loss": 5.8686, + "step": 369 + }, + { + "epoch": 0.0037, + "grad_norm": 0.4473709149836047, + "learning_rate": 0.00111, + "loss": 5.8472, + "step": 370 + }, + { + "epoch": 0.00371, + "grad_norm": 0.38589559577094035, + "learning_rate": 0.001113, + "loss": 5.8158, + "step": 371 + }, + { + "epoch": 0.00372, + "grad_norm": 0.3912315505838062, + "learning_rate": 0.001116, + "loss": 5.8379, + "step": 372 + }, + { + "epoch": 0.00373, + "grad_norm": 0.38616823047071297, + "learning_rate": 0.001119, + "loss": 5.8267, + "step": 373 + }, + { + "epoch": 0.00374, + "grad_norm": 0.45854090440574513, + "learning_rate": 0.001122, + "loss": 5.8316, + "step": 374 + }, + { + "epoch": 0.00375, + "grad_norm": 0.5169440196993219, + "learning_rate": 0.0011250000000000001, + "loss": 5.8332, + "step": 375 + }, + { + "epoch": 0.00376, + "grad_norm": 0.5067806568705457, + "learning_rate": 0.001128, + "loss": 5.8287, + "step": 376 + }, + { + "epoch": 0.00377, + "grad_norm": 0.48558945502532774, + "learning_rate": 0.001131, + "loss": 5.8236, + "step": 377 + }, + { + "epoch": 0.00378, + "grad_norm": 0.47384141098896654, + "learning_rate": 0.001134, + "loss": 5.8187, + "step": 378 + }, + { + "epoch": 0.00379, + "grad_norm": 0.5705731390544022, + "learning_rate": 0.001137, + "loss": 5.8065, + "step": 379 + }, + { + "epoch": 0.0038, + "grad_norm": 0.8415616570321116, + "learning_rate": 0.00114, + "loss": 5.8323, + "step": 380 + }, + { + "epoch": 0.00381, + "grad_norm": 1.152388235651458, + "learning_rate": 0.0011430000000000001, + "loss": 5.8155, + "step": 381 + }, + { + "epoch": 0.00382, + "grad_norm": 0.7784536663385624, + "learning_rate": 0.001146, + "loss": 5.7896, + "step": 382 + }, + { + "epoch": 0.00383, + "grad_norm": 1.2096458575940454, + "learning_rate": 0.001149, + "loss": 5.8132, + "step": 383 + }, + { + "epoch": 0.00384, + "grad_norm": 1.2032626959449177, + "learning_rate": 0.001152, + "loss": 5.8295, + "step": 384 + }, + { + "epoch": 0.00385, + "grad_norm": 1.2258405640081835, + "learning_rate": 0.001155, + "loss": 5.8193, + "step": 385 + }, + { + "epoch": 0.00386, + "grad_norm": 1.060557976067675, + "learning_rate": 0.001158, + "loss": 5.8182, + "step": 386 + }, + { + "epoch": 0.00387, + "grad_norm": 1.6852101829047932, + "learning_rate": 0.0011610000000000001, + "loss": 5.8306, + "step": 387 + }, + { + "epoch": 0.00388, + "grad_norm": 0.7125426173667109, + "learning_rate": 0.001164, + "loss": 5.7875, + "step": 388 + }, + { + "epoch": 0.00389, + "grad_norm": 0.9333298966305301, + "learning_rate": 0.001167, + "loss": 5.8092, + "step": 389 + }, + { + "epoch": 0.0039, + "grad_norm": 0.7871116791575423, + "learning_rate": 0.00117, + "loss": 5.7842, + "step": 390 + }, + { + "epoch": 0.00391, + "grad_norm": 0.9033950769229127, + "learning_rate": 0.001173, + "loss": 5.7945, + "step": 391 + }, + { + "epoch": 0.00392, + "grad_norm": 1.0985861295177402, + "learning_rate": 0.001176, + "loss": 5.8091, + "step": 392 + }, + { + "epoch": 0.00393, + "grad_norm": 0.9893983760666882, + "learning_rate": 0.0011790000000000001, + "loss": 5.787, + "step": 393 + }, + { + "epoch": 0.00394, + "grad_norm": 1.0087630537900902, + "learning_rate": 0.001182, + "loss": 5.7718, + "step": 394 + }, + { + "epoch": 0.00395, + "grad_norm": 0.9357634093540522, + "learning_rate": 0.001185, + "loss": 5.7577, + "step": 395 + }, + { + "epoch": 0.00396, + "grad_norm": 0.8613606742928634, + "learning_rate": 0.001188, + "loss": 5.7674, + "step": 396 + }, + { + "epoch": 0.00397, + "grad_norm": 0.9393680367248612, + "learning_rate": 0.001191, + "loss": 5.7666, + "step": 397 + }, + { + "epoch": 0.00398, + "grad_norm": 0.8380984764873387, + "learning_rate": 0.0011940000000000002, + "loss": 5.7669, + "step": 398 + }, + { + "epoch": 0.00399, + "grad_norm": 0.7495495962771003, + "learning_rate": 0.0011970000000000001, + "loss": 5.7689, + "step": 399 + }, + { + "epoch": 0.004, + "grad_norm": 0.6237821646680863, + "learning_rate": 0.0012000000000000001, + "loss": 5.751, + "step": 400 + }, + { + "epoch": 0.00401, + "grad_norm": 0.6042562364668606, + "learning_rate": 0.001203, + "loss": 5.7286, + "step": 401 + }, + { + "epoch": 0.00402, + "grad_norm": 0.6800421237430357, + "learning_rate": 0.001206, + "loss": 5.7387, + "step": 402 + }, + { + "epoch": 0.00403, + "grad_norm": 0.5349967773183291, + "learning_rate": 0.001209, + "loss": 5.7296, + "step": 403 + }, + { + "epoch": 0.00404, + "grad_norm": 0.4491885962138907, + "learning_rate": 0.0012120000000000002, + "loss": 5.7204, + "step": 404 + }, + { + "epoch": 0.00405, + "grad_norm": 0.4231295613571548, + "learning_rate": 0.0012150000000000002, + "loss": 5.7133, + "step": 405 + }, + { + "epoch": 0.00406, + "grad_norm": 0.4344671315280792, + "learning_rate": 0.0012180000000000001, + "loss": 5.7007, + "step": 406 + }, + { + "epoch": 0.00407, + "grad_norm": 0.3917681066216858, + "learning_rate": 0.0012209999999999999, + "loss": 5.6947, + "step": 407 + }, + { + "epoch": 0.00408, + "grad_norm": 0.3526446903520604, + "learning_rate": 0.001224, + "loss": 5.7113, + "step": 408 + }, + { + "epoch": 0.00409, + "grad_norm": 0.3601774169994176, + "learning_rate": 0.001227, + "loss": 5.689, + "step": 409 + }, + { + "epoch": 0.0041, + "grad_norm": 0.3999316895065895, + "learning_rate": 0.00123, + "loss": 5.6821, + "step": 410 + }, + { + "epoch": 0.00411, + "grad_norm": 0.5215468120681382, + "learning_rate": 0.001233, + "loss": 5.6975, + "step": 411 + }, + { + "epoch": 0.00412, + "grad_norm": 0.8731356141140694, + "learning_rate": 0.001236, + "loss": 5.697, + "step": 412 + }, + { + "epoch": 0.00413, + "grad_norm": 1.2920248463477522, + "learning_rate": 0.0012389999999999999, + "loss": 5.7158, + "step": 413 + }, + { + "epoch": 0.00414, + "grad_norm": 0.7474803494460109, + "learning_rate": 0.001242, + "loss": 5.6771, + "step": 414 + }, + { + "epoch": 0.00415, + "grad_norm": 0.9736431117993121, + "learning_rate": 0.001245, + "loss": 5.6888, + "step": 415 + }, + { + "epoch": 0.00416, + "grad_norm": 0.8653333577780613, + "learning_rate": 0.001248, + "loss": 5.6728, + "step": 416 + }, + { + "epoch": 0.00417, + "grad_norm": 0.6891363999339204, + "learning_rate": 0.001251, + "loss": 5.695, + "step": 417 + }, + { + "epoch": 0.00418, + "grad_norm": 0.7955125411502495, + "learning_rate": 0.001254, + "loss": 5.6734, + "step": 418 + }, + { + "epoch": 0.00419, + "grad_norm": 0.8034523576562718, + "learning_rate": 0.0012569999999999999, + "loss": 5.6601, + "step": 419 + }, + { + "epoch": 0.0042, + "grad_norm": 0.7731586474207807, + "learning_rate": 0.00126, + "loss": 5.6748, + "step": 420 + }, + { + "epoch": 0.00421, + "grad_norm": 0.747486262420627, + "learning_rate": 0.001263, + "loss": 5.6666, + "step": 421 + }, + { + "epoch": 0.00422, + "grad_norm": 0.7917981329409665, + "learning_rate": 0.001266, + "loss": 5.6544, + "step": 422 + }, + { + "epoch": 0.00423, + "grad_norm": 1.0889555078416353, + "learning_rate": 0.001269, + "loss": 5.6655, + "step": 423 + }, + { + "epoch": 0.00424, + "grad_norm": 0.9654337501414605, + "learning_rate": 0.001272, + "loss": 5.6614, + "step": 424 + }, + { + "epoch": 0.00425, + "grad_norm": 0.9055610792467201, + "learning_rate": 0.001275, + "loss": 5.6624, + "step": 425 + }, + { + "epoch": 0.00426, + "grad_norm": 0.8212981627676188, + "learning_rate": 0.001278, + "loss": 5.6637, + "step": 426 + }, + { + "epoch": 0.00427, + "grad_norm": 0.7602647353125763, + "learning_rate": 0.001281, + "loss": 5.6467, + "step": 427 + }, + { + "epoch": 0.00428, + "grad_norm": 0.677777266675102, + "learning_rate": 0.001284, + "loss": 5.6204, + "step": 428 + }, + { + "epoch": 0.00429, + "grad_norm": 0.5947091658499406, + "learning_rate": 0.001287, + "loss": 5.6311, + "step": 429 + }, + { + "epoch": 0.0043, + "grad_norm": 0.6377204770277832, + "learning_rate": 0.00129, + "loss": 5.6309, + "step": 430 + }, + { + "epoch": 0.00431, + "grad_norm": 0.6897719280155576, + "learning_rate": 0.001293, + "loss": 5.6193, + "step": 431 + }, + { + "epoch": 0.00432, + "grad_norm": 0.6884919593361081, + "learning_rate": 0.001296, + "loss": 5.6258, + "step": 432 + }, + { + "epoch": 0.00433, + "grad_norm": 0.6913913571918432, + "learning_rate": 0.001299, + "loss": 5.6177, + "step": 433 + }, + { + "epoch": 0.00434, + "grad_norm": 0.7261280979587743, + "learning_rate": 0.001302, + "loss": 5.6176, + "step": 434 + }, + { + "epoch": 0.00435, + "grad_norm": 0.8547702731757605, + "learning_rate": 0.001305, + "loss": 5.6162, + "step": 435 + }, + { + "epoch": 0.00436, + "grad_norm": 0.9457491419795808, + "learning_rate": 0.001308, + "loss": 5.5986, + "step": 436 + }, + { + "epoch": 0.00437, + "grad_norm": 0.9092672289397813, + "learning_rate": 0.001311, + "loss": 5.6144, + "step": 437 + }, + { + "epoch": 0.00438, + "grad_norm": 0.9049337850080227, + "learning_rate": 0.001314, + "loss": 5.6026, + "step": 438 + }, + { + "epoch": 0.00439, + "grad_norm": 0.7237349559204094, + "learning_rate": 0.001317, + "loss": 5.622, + "step": 439 + }, + { + "epoch": 0.0044, + "grad_norm": 0.8693791239531735, + "learning_rate": 0.00132, + "loss": 5.6008, + "step": 440 + }, + { + "epoch": 0.00441, + "grad_norm": 0.8508838859779835, + "learning_rate": 0.001323, + "loss": 5.5985, + "step": 441 + }, + { + "epoch": 0.00442, + "grad_norm": 0.6987140204651114, + "learning_rate": 0.0013260000000000001, + "loss": 5.587, + "step": 442 + }, + { + "epoch": 0.00443, + "grad_norm": 0.5719525863328404, + "learning_rate": 0.001329, + "loss": 5.5843, + "step": 443 + }, + { + "epoch": 0.00444, + "grad_norm": 0.5407139364493208, + "learning_rate": 0.001332, + "loss": 5.5841, + "step": 444 + }, + { + "epoch": 0.00445, + "grad_norm": 0.5170533332401992, + "learning_rate": 0.001335, + "loss": 5.5667, + "step": 445 + }, + { + "epoch": 0.00446, + "grad_norm": 0.43806698904849195, + "learning_rate": 0.001338, + "loss": 5.5666, + "step": 446 + }, + { + "epoch": 0.00447, + "grad_norm": 0.49048920433285326, + "learning_rate": 0.001341, + "loss": 5.5671, + "step": 447 + }, + { + "epoch": 0.00448, + "grad_norm": 0.46215050883864656, + "learning_rate": 0.0013440000000000001, + "loss": 5.5475, + "step": 448 + }, + { + "epoch": 0.00449, + "grad_norm": 0.5259389714982564, + "learning_rate": 0.001347, + "loss": 5.5523, + "step": 449 + }, + { + "epoch": 0.0045, + "grad_norm": 0.6261840891481112, + "learning_rate": 0.00135, + "loss": 5.5542, + "step": 450 + }, + { + "epoch": 0.00451, + "grad_norm": 0.659112753346069, + "learning_rate": 0.001353, + "loss": 5.5431, + "step": 451 + }, + { + "epoch": 0.00452, + "grad_norm": 0.6800042712218282, + "learning_rate": 0.001356, + "loss": 5.5311, + "step": 452 + }, + { + "epoch": 0.00453, + "grad_norm": 0.5745322110996829, + "learning_rate": 0.001359, + "loss": 5.5188, + "step": 453 + }, + { + "epoch": 0.00454, + "grad_norm": 0.5230006416449293, + "learning_rate": 0.0013620000000000001, + "loss": 5.5319, + "step": 454 + }, + { + "epoch": 0.00455, + "grad_norm": 0.5858648520183006, + "learning_rate": 0.0013650000000000001, + "loss": 5.5314, + "step": 455 + }, + { + "epoch": 0.00456, + "grad_norm": 0.5800568870191161, + "learning_rate": 0.001368, + "loss": 5.5269, + "step": 456 + }, + { + "epoch": 0.00457, + "grad_norm": 0.5535894536098482, + "learning_rate": 0.001371, + "loss": 5.5162, + "step": 457 + }, + { + "epoch": 0.00458, + "grad_norm": 0.56095957103827, + "learning_rate": 0.001374, + "loss": 5.5224, + "step": 458 + }, + { + "epoch": 0.00459, + "grad_norm": 0.7492551931077938, + "learning_rate": 0.0013770000000000002, + "loss": 5.517, + "step": 459 + }, + { + "epoch": 0.0046, + "grad_norm": 0.8251083608050601, + "learning_rate": 0.0013800000000000002, + "loss": 5.5084, + "step": 460 + }, + { + "epoch": 0.00461, + "grad_norm": 0.7810512714683711, + "learning_rate": 0.0013830000000000001, + "loss": 5.5038, + "step": 461 + }, + { + "epoch": 0.00462, + "grad_norm": 0.8065032793416945, + "learning_rate": 0.001386, + "loss": 5.5174, + "step": 462 + }, + { + "epoch": 0.00463, + "grad_norm": 1.0894770209329594, + "learning_rate": 0.001389, + "loss": 5.5013, + "step": 463 + }, + { + "epoch": 0.00464, + "grad_norm": 1.3225439160647088, + "learning_rate": 0.001392, + "loss": 5.5348, + "step": 464 + }, + { + "epoch": 0.00465, + "grad_norm": 1.0604196603169807, + "learning_rate": 0.0013950000000000002, + "loss": 5.4973, + "step": 465 + }, + { + "epoch": 0.00466, + "grad_norm": 1.279638390325454, + "learning_rate": 0.0013980000000000002, + "loss": 5.5365, + "step": 466 + }, + { + "epoch": 0.00467, + "grad_norm": 0.8376473861337255, + "learning_rate": 0.0014010000000000001, + "loss": 5.5034, + "step": 467 + }, + { + "epoch": 0.00468, + "grad_norm": 0.8787509291075707, + "learning_rate": 0.001404, + "loss": 5.4981, + "step": 468 + }, + { + "epoch": 0.00469, + "grad_norm": 0.8315105482947757, + "learning_rate": 0.001407, + "loss": 5.4995, + "step": 469 + }, + { + "epoch": 0.0047, + "grad_norm": 0.900233910053011, + "learning_rate": 0.00141, + "loss": 5.5098, + "step": 470 + }, + { + "epoch": 0.00471, + "grad_norm": 1.1782268624389831, + "learning_rate": 0.001413, + "loss": 5.5031, + "step": 471 + }, + { + "epoch": 0.00472, + "grad_norm": 0.8433457613569132, + "learning_rate": 0.001416, + "loss": 5.4989, + "step": 472 + }, + { + "epoch": 0.00473, + "grad_norm": 0.8984284318795871, + "learning_rate": 0.001419, + "loss": 5.5107, + "step": 473 + }, + { + "epoch": 0.00474, + "grad_norm": 0.8057807296189134, + "learning_rate": 0.0014219999999999999, + "loss": 5.4892, + "step": 474 + }, + { + "epoch": 0.00475, + "grad_norm": 0.8485388443307728, + "learning_rate": 0.001425, + "loss": 5.4826, + "step": 475 + }, + { + "epoch": 0.00476, + "grad_norm": 0.9809665505076786, + "learning_rate": 0.001428, + "loss": 5.5192, + "step": 476 + }, + { + "epoch": 0.00477, + "grad_norm": 0.9686040040277449, + "learning_rate": 0.001431, + "loss": 5.4785, + "step": 477 + }, + { + "epoch": 0.00478, + "grad_norm": 0.8580634771679295, + "learning_rate": 0.001434, + "loss": 5.4949, + "step": 478 + }, + { + "epoch": 0.00479, + "grad_norm": 0.9699411511566143, + "learning_rate": 0.001437, + "loss": 5.4782, + "step": 479 + }, + { + "epoch": 0.0048, + "grad_norm": 0.8190893004419723, + "learning_rate": 0.0014399999999999999, + "loss": 5.4711, + "step": 480 + }, + { + "epoch": 0.00481, + "grad_norm": 0.7019568417012634, + "learning_rate": 0.001443, + "loss": 5.4711, + "step": 481 + }, + { + "epoch": 0.00482, + "grad_norm": 0.6677085458766991, + "learning_rate": 0.001446, + "loss": 5.4413, + "step": 482 + }, + { + "epoch": 0.00483, + "grad_norm": 0.6622223138809283, + "learning_rate": 0.001449, + "loss": 5.4499, + "step": 483 + }, + { + "epoch": 0.00484, + "grad_norm": 0.6831178312322733, + "learning_rate": 0.001452, + "loss": 5.4529, + "step": 484 + }, + { + "epoch": 0.00485, + "grad_norm": 0.6786720110326826, + "learning_rate": 0.001455, + "loss": 5.4548, + "step": 485 + }, + { + "epoch": 0.00486, + "grad_norm": 0.639008514866701, + "learning_rate": 0.001458, + "loss": 5.4237, + "step": 486 + }, + { + "epoch": 0.00487, + "grad_norm": 0.6663550617928226, + "learning_rate": 0.001461, + "loss": 5.4384, + "step": 487 + }, + { + "epoch": 0.00488, + "grad_norm": 0.5492133897414355, + "learning_rate": 0.001464, + "loss": 5.4132, + "step": 488 + }, + { + "epoch": 0.00489, + "grad_norm": 0.5801507624750007, + "learning_rate": 0.001467, + "loss": 5.4021, + "step": 489 + }, + { + "epoch": 0.0049, + "grad_norm": 0.7369258331072537, + "learning_rate": 0.00147, + "loss": 5.4206, + "step": 490 + }, + { + "epoch": 0.00491, + "grad_norm": 0.8149385883601376, + "learning_rate": 0.001473, + "loss": 5.4211, + "step": 491 + }, + { + "epoch": 0.00492, + "grad_norm": 0.7605903135404127, + "learning_rate": 0.001476, + "loss": 5.4167, + "step": 492 + }, + { + "epoch": 0.00493, + "grad_norm": 0.5930344173404182, + "learning_rate": 0.001479, + "loss": 5.4047, + "step": 493 + }, + { + "epoch": 0.00494, + "grad_norm": 0.7636413305061132, + "learning_rate": 0.001482, + "loss": 5.4167, + "step": 494 + }, + { + "epoch": 0.00495, + "grad_norm": 0.7369014833667976, + "learning_rate": 0.001485, + "loss": 5.4191, + "step": 495 + }, + { + "epoch": 0.00496, + "grad_norm": 0.8504550792082386, + "learning_rate": 0.001488, + "loss": 5.4031, + "step": 496 + }, + { + "epoch": 0.00497, + "grad_norm": 0.8843425860263048, + "learning_rate": 0.001491, + "loss": 5.3921, + "step": 497 + }, + { + "epoch": 0.00498, + "grad_norm": 0.7438750793797253, + "learning_rate": 0.001494, + "loss": 5.4145, + "step": 498 + }, + { + "epoch": 0.00499, + "grad_norm": 0.7036650868069556, + "learning_rate": 0.001497, + "loss": 5.3822, + "step": 499 + }, + { + "epoch": 0.005, + "grad_norm": 0.7877256477029045, + "learning_rate": 0.0015, + "loss": 5.3896, + "step": 500 + }, + { + "epoch": 0.00501, + "grad_norm": 0.7990985096145019, + "learning_rate": 0.001503, + "loss": 5.3912, + "step": 501 + }, + { + "epoch": 0.00502, + "grad_norm": 0.5932348165440957, + "learning_rate": 0.001506, + "loss": 5.3876, + "step": 502 + }, + { + "epoch": 0.00503, + "grad_norm": 0.6328380348360387, + "learning_rate": 0.0015090000000000001, + "loss": 5.391, + "step": 503 + }, + { + "epoch": 0.00504, + "grad_norm": 0.5819727032922326, + "learning_rate": 0.001512, + "loss": 5.3693, + "step": 504 + }, + { + "epoch": 0.00505, + "grad_norm": 0.5953710061568833, + "learning_rate": 0.001515, + "loss": 5.3594, + "step": 505 + }, + { + "epoch": 0.00506, + "grad_norm": 0.5986845177656173, + "learning_rate": 0.001518, + "loss": 5.3624, + "step": 506 + }, + { + "epoch": 0.00507, + "grad_norm": 0.623690249743195, + "learning_rate": 0.001521, + "loss": 5.3571, + "step": 507 + }, + { + "epoch": 0.00508, + "grad_norm": 0.653996676321799, + "learning_rate": 0.001524, + "loss": 5.3588, + "step": 508 + }, + { + "epoch": 0.00509, + "grad_norm": 0.7417086851753733, + "learning_rate": 0.0015270000000000001, + "loss": 5.3422, + "step": 509 + }, + { + "epoch": 0.0051, + "grad_norm": 0.7033408638361137, + "learning_rate": 0.0015300000000000001, + "loss": 5.3598, + "step": 510 + }, + { + "epoch": 0.00511, + "grad_norm": 0.7013752626190988, + "learning_rate": 0.001533, + "loss": 5.3361, + "step": 511 + }, + { + "epoch": 0.00512, + "grad_norm": 0.7403626060663853, + "learning_rate": 0.001536, + "loss": 5.3344, + "step": 512 + }, + { + "epoch": 0.00513, + "grad_norm": 0.7668257914395668, + "learning_rate": 0.001539, + "loss": 5.3488, + "step": 513 + }, + { + "epoch": 0.00514, + "grad_norm": 0.8677889222141009, + "learning_rate": 0.001542, + "loss": 5.3327, + "step": 514 + }, + { + "epoch": 0.00515, + "grad_norm": 0.896065553430359, + "learning_rate": 0.0015450000000000001, + "loss": 5.3422, + "step": 515 + }, + { + "epoch": 0.00516, + "grad_norm": 1.0837566571017694, + "learning_rate": 0.0015480000000000001, + "loss": 5.3497, + "step": 516 + }, + { + "epoch": 0.00517, + "grad_norm": 0.8071431981996826, + "learning_rate": 0.001551, + "loss": 5.3323, + "step": 517 + }, + { + "epoch": 0.00518, + "grad_norm": 0.7918860105262308, + "learning_rate": 0.001554, + "loss": 5.3156, + "step": 518 + }, + { + "epoch": 0.00519, + "grad_norm": 0.7777992304037674, + "learning_rate": 0.001557, + "loss": 5.3213, + "step": 519 + }, + { + "epoch": 0.0052, + "grad_norm": 0.8275508154311308, + "learning_rate": 0.0015600000000000002, + "loss": 5.3297, + "step": 520 + }, + { + "epoch": 0.00521, + "grad_norm": 1.081326682572488, + "learning_rate": 0.0015630000000000002, + "loss": 5.3161, + "step": 521 + }, + { + "epoch": 0.00522, + "grad_norm": 1.0769033841173197, + "learning_rate": 0.0015660000000000001, + "loss": 5.3235, + "step": 522 + }, + { + "epoch": 0.00523, + "grad_norm": 1.1142920349652348, + "learning_rate": 0.001569, + "loss": 5.3418, + "step": 523 + }, + { + "epoch": 0.00524, + "grad_norm": 0.9680819380144772, + "learning_rate": 0.001572, + "loss": 5.3387, + "step": 524 + }, + { + "epoch": 0.00525, + "grad_norm": 1.042843464512002, + "learning_rate": 0.001575, + "loss": 5.3364, + "step": 525 + }, + { + "epoch": 0.00526, + "grad_norm": 0.8760110028730904, + "learning_rate": 0.0015780000000000002, + "loss": 5.3079, + "step": 526 + }, + { + "epoch": 0.00527, + "grad_norm": 0.7131338611439731, + "learning_rate": 0.0015810000000000002, + "loss": 5.3127, + "step": 527 + }, + { + "epoch": 0.00528, + "grad_norm": 0.6786352126644868, + "learning_rate": 0.0015840000000000001, + "loss": 5.2868, + "step": 528 + }, + { + "epoch": 0.00529, + "grad_norm": 0.6952357562094686, + "learning_rate": 0.001587, + "loss": 5.2922, + "step": 529 + }, + { + "epoch": 0.0053, + "grad_norm": 0.8086799159810172, + "learning_rate": 0.00159, + "loss": 5.3039, + "step": 530 + }, + { + "epoch": 0.00531, + "grad_norm": 0.828973806141186, + "learning_rate": 0.001593, + "loss": 5.2873, + "step": 531 + }, + { + "epoch": 0.00532, + "grad_norm": 0.7467931538676229, + "learning_rate": 0.0015960000000000002, + "loss": 5.2943, + "step": 532 + }, + { + "epoch": 0.00533, + "grad_norm": 0.7141354989500697, + "learning_rate": 0.0015990000000000002, + "loss": 5.2786, + "step": 533 + }, + { + "epoch": 0.00534, + "grad_norm": 0.9764493557723114, + "learning_rate": 0.0016020000000000001, + "loss": 5.2728, + "step": 534 + }, + { + "epoch": 0.00535, + "grad_norm": 1.3880068471110967, + "learning_rate": 0.001605, + "loss": 5.2984, + "step": 535 + }, + { + "epoch": 0.00536, + "grad_norm": 0.9041341459356813, + "learning_rate": 0.001608, + "loss": 5.2855, + "step": 536 + }, + { + "epoch": 0.00537, + "grad_norm": 0.762740194970871, + "learning_rate": 0.0016110000000000002, + "loss": 5.2723, + "step": 537 + }, + { + "epoch": 0.00538, + "grad_norm": 0.787312661332683, + "learning_rate": 0.0016140000000000002, + "loss": 5.2506, + "step": 538 + }, + { + "epoch": 0.00539, + "grad_norm": 0.6102453826005042, + "learning_rate": 0.0016170000000000002, + "loss": 5.2365, + "step": 539 + }, + { + "epoch": 0.0054, + "grad_norm": 0.6664103859218952, + "learning_rate": 0.0016200000000000001, + "loss": 5.2513, + "step": 540 + }, + { + "epoch": 0.00541, + "grad_norm": 0.7228434818484509, + "learning_rate": 0.001623, + "loss": 5.2273, + "step": 541 + }, + { + "epoch": 0.00542, + "grad_norm": 0.9646545444558308, + "learning_rate": 0.001626, + "loss": 5.2641, + "step": 542 + }, + { + "epoch": 0.00543, + "grad_norm": 1.1121220265997553, + "learning_rate": 0.0016290000000000002, + "loss": 5.2329, + "step": 543 + }, + { + "epoch": 0.00544, + "grad_norm": 0.7994777164441184, + "learning_rate": 0.0016320000000000002, + "loss": 5.2404, + "step": 544 + }, + { + "epoch": 0.00545, + "grad_norm": 0.7226008260314222, + "learning_rate": 0.0016350000000000002, + "loss": 5.2461, + "step": 545 + }, + { + "epoch": 0.00546, + "grad_norm": 0.7699535423166085, + "learning_rate": 0.0016380000000000001, + "loss": 5.2193, + "step": 546 + }, + { + "epoch": 0.00547, + "grad_norm": 0.6548240326600666, + "learning_rate": 0.001641, + "loss": 5.2108, + "step": 547 + }, + { + "epoch": 0.00548, + "grad_norm": 0.6332922946851393, + "learning_rate": 0.001644, + "loss": 5.2061, + "step": 548 + }, + { + "epoch": 0.00549, + "grad_norm": 0.6231528959927674, + "learning_rate": 0.0016470000000000002, + "loss": 5.2, + "step": 549 + }, + { + "epoch": 0.0055, + "grad_norm": 0.7419840881368932, + "learning_rate": 0.0016500000000000002, + "loss": 5.2179, + "step": 550 + }, + { + "epoch": 0.00551, + "grad_norm": 0.7180205816820676, + "learning_rate": 0.0016530000000000002, + "loss": 5.2057, + "step": 551 + }, + { + "epoch": 0.00552, + "grad_norm": 0.5920069574731561, + "learning_rate": 0.0016560000000000001, + "loss": 5.1823, + "step": 552 + }, + { + "epoch": 0.00553, + "grad_norm": 0.7996998429214144, + "learning_rate": 0.001659, + "loss": 5.1806, + "step": 553 + }, + { + "epoch": 0.00554, + "grad_norm": 1.0229110500291838, + "learning_rate": 0.0016620000000000003, + "loss": 5.1965, + "step": 554 + }, + { + "epoch": 0.00555, + "grad_norm": 1.1118473608885646, + "learning_rate": 0.0016650000000000002, + "loss": 5.1994, + "step": 555 + }, + { + "epoch": 0.00556, + "grad_norm": 0.9366759039894813, + "learning_rate": 0.0016680000000000002, + "loss": 5.1806, + "step": 556 + }, + { + "epoch": 0.00557, + "grad_norm": 0.9046668934887724, + "learning_rate": 0.0016710000000000002, + "loss": 5.1671, + "step": 557 + }, + { + "epoch": 0.00558, + "grad_norm": 1.142251826676036, + "learning_rate": 0.0016740000000000001, + "loss": 5.2009, + "step": 558 + }, + { + "epoch": 0.00559, + "grad_norm": 1.0520781475504497, + "learning_rate": 0.001677, + "loss": 5.1865, + "step": 559 + }, + { + "epoch": 0.0056, + "grad_norm": 1.0780070897638405, + "learning_rate": 0.0016800000000000003, + "loss": 5.1609, + "step": 560 + }, + { + "epoch": 0.00561, + "grad_norm": 0.8904071170090557, + "learning_rate": 0.0016830000000000003, + "loss": 5.1755, + "step": 561 + }, + { + "epoch": 0.00562, + "grad_norm": 0.8189640026396579, + "learning_rate": 0.0016860000000000002, + "loss": 5.168, + "step": 562 + }, + { + "epoch": 0.00563, + "grad_norm": 0.746495696524217, + "learning_rate": 0.001689, + "loss": 5.1552, + "step": 563 + }, + { + "epoch": 0.00564, + "grad_norm": 0.7249953066463264, + "learning_rate": 0.001692, + "loss": 5.1416, + "step": 564 + }, + { + "epoch": 0.00565, + "grad_norm": 0.6193711615047397, + "learning_rate": 0.001695, + "loss": 5.1336, + "step": 565 + }, + { + "epoch": 0.00566, + "grad_norm": 0.8661212922050541, + "learning_rate": 0.0016979999999999999, + "loss": 5.1381, + "step": 566 + }, + { + "epoch": 0.00567, + "grad_norm": 0.9452019797636565, + "learning_rate": 0.0017009999999999998, + "loss": 5.1333, + "step": 567 + }, + { + "epoch": 0.00568, + "grad_norm": 0.8863756714851743, + "learning_rate": 0.0017039999999999998, + "loss": 5.1455, + "step": 568 + }, + { + "epoch": 0.00569, + "grad_norm": 0.8164512297006329, + "learning_rate": 0.001707, + "loss": 5.1087, + "step": 569 + }, + { + "epoch": 0.0057, + "grad_norm": 0.8055756655780417, + "learning_rate": 0.00171, + "loss": 5.1416, + "step": 570 + }, + { + "epoch": 0.00571, + "grad_norm": 0.9556127682537684, + "learning_rate": 0.001713, + "loss": 5.1421, + "step": 571 + }, + { + "epoch": 0.00572, + "grad_norm": 1.1121438340859977, + "learning_rate": 0.0017159999999999999, + "loss": 5.1242, + "step": 572 + }, + { + "epoch": 0.00573, + "grad_norm": 0.8538691427356556, + "learning_rate": 0.0017189999999999998, + "loss": 5.1261, + "step": 573 + }, + { + "epoch": 0.00574, + "grad_norm": 0.754134808897758, + "learning_rate": 0.001722, + "loss": 5.1186, + "step": 574 + }, + { + "epoch": 0.00575, + "grad_norm": 0.6045959777005846, + "learning_rate": 0.001725, + "loss": 5.0826, + "step": 575 + }, + { + "epoch": 0.00576, + "grad_norm": 0.5849168439848929, + "learning_rate": 0.001728, + "loss": 5.0868, + "step": 576 + }, + { + "epoch": 0.00577, + "grad_norm": 0.5881868482585118, + "learning_rate": 0.001731, + "loss": 5.0984, + "step": 577 + }, + { + "epoch": 0.00578, + "grad_norm": 0.6496481817365951, + "learning_rate": 0.0017339999999999999, + "loss": 5.0795, + "step": 578 + }, + { + "epoch": 0.00579, + "grad_norm": 0.7126042661301508, + "learning_rate": 0.0017369999999999998, + "loss": 5.0702, + "step": 579 + }, + { + "epoch": 0.0058, + "grad_norm": 0.691634070596695, + "learning_rate": 0.00174, + "loss": 5.0826, + "step": 580 + }, + { + "epoch": 0.00581, + "grad_norm": 0.6405819953602082, + "learning_rate": 0.001743, + "loss": 5.0809, + "step": 581 + }, + { + "epoch": 0.00582, + "grad_norm": 0.6144348123489994, + "learning_rate": 0.001746, + "loss": 5.0509, + "step": 582 + }, + { + "epoch": 0.00583, + "grad_norm": 0.5400038424579979, + "learning_rate": 0.001749, + "loss": 5.0752, + "step": 583 + }, + { + "epoch": 0.00584, + "grad_norm": 0.4936797939946059, + "learning_rate": 0.0017519999999999999, + "loss": 5.0634, + "step": 584 + }, + { + "epoch": 0.00585, + "grad_norm": 0.5420757595953297, + "learning_rate": 0.0017549999999999998, + "loss": 5.0509, + "step": 585 + }, + { + "epoch": 0.00586, + "grad_norm": 0.6461298473240921, + "learning_rate": 0.001758, + "loss": 5.0463, + "step": 586 + }, + { + "epoch": 0.00587, + "grad_norm": 1.0127747465457377, + "learning_rate": 0.001761, + "loss": 5.0477, + "step": 587 + }, + { + "epoch": 0.00588, + "grad_norm": 1.312605638466154, + "learning_rate": 0.001764, + "loss": 5.0646, + "step": 588 + }, + { + "epoch": 0.00589, + "grad_norm": 0.7336414507180539, + "learning_rate": 0.001767, + "loss": 5.0246, + "step": 589 + }, + { + "epoch": 0.0059, + "grad_norm": 0.9403709566188089, + "learning_rate": 0.0017699999999999999, + "loss": 5.0344, + "step": 590 + }, + { + "epoch": 0.00591, + "grad_norm": 1.1082528668309917, + "learning_rate": 0.001773, + "loss": 5.0455, + "step": 591 + }, + { + "epoch": 0.00592, + "grad_norm": 1.2840924567627583, + "learning_rate": 0.001776, + "loss": 5.0904, + "step": 592 + }, + { + "epoch": 0.00593, + "grad_norm": 0.7010705426983365, + "learning_rate": 0.001779, + "loss": 5.0507, + "step": 593 + }, + { + "epoch": 0.00594, + "grad_norm": 0.7515579064184676, + "learning_rate": 0.001782, + "loss": 5.0452, + "step": 594 + }, + { + "epoch": 0.00595, + "grad_norm": 0.8237589608472985, + "learning_rate": 0.001785, + "loss": 5.0574, + "step": 595 + }, + { + "epoch": 0.00596, + "grad_norm": 0.7511193245039597, + "learning_rate": 0.0017879999999999999, + "loss": 5.0458, + "step": 596 + }, + { + "epoch": 0.00597, + "grad_norm": 0.6951714106885373, + "learning_rate": 0.001791, + "loss": 5.0059, + "step": 597 + }, + { + "epoch": 0.00598, + "grad_norm": 0.6637745885790589, + "learning_rate": 0.001794, + "loss": 5.0231, + "step": 598 + }, + { + "epoch": 0.00599, + "grad_norm": 0.7127858481763457, + "learning_rate": 0.001797, + "loss": 5.0133, + "step": 599 + }, + { + "epoch": 0.006, + "grad_norm": 0.6761974733345899, + "learning_rate": 0.0018, + "loss": 5.0135, + "step": 600 + }, + { + "epoch": 0.00601, + "grad_norm": 0.6625605364634614, + "learning_rate": 0.001803, + "loss": 5.0015, + "step": 601 + }, + { + "epoch": 0.00602, + "grad_norm": 0.6742478727145375, + "learning_rate": 0.0018059999999999999, + "loss": 4.9862, + "step": 602 + }, + { + "epoch": 0.00603, + "grad_norm": 1.00015732542698, + "learning_rate": 0.001809, + "loss": 5.0193, + "step": 603 + }, + { + "epoch": 0.00604, + "grad_norm": 1.387382192884798, + "learning_rate": 0.001812, + "loss": 5.0251, + "step": 604 + }, + { + "epoch": 0.00605, + "grad_norm": 0.6727404947716551, + "learning_rate": 0.001815, + "loss": 5.0023, + "step": 605 + }, + { + "epoch": 0.00606, + "grad_norm": 0.9044609854709968, + "learning_rate": 0.001818, + "loss": 5.0189, + "step": 606 + }, + { + "epoch": 0.00607, + "grad_norm": 1.08596708759871, + "learning_rate": 0.001821, + "loss": 5.0221, + "step": 607 + }, + { + "epoch": 0.00608, + "grad_norm": 1.0369936566425986, + "learning_rate": 0.001824, + "loss": 5.008, + "step": 608 + }, + { + "epoch": 0.00609, + "grad_norm": 1.0935517991120203, + "learning_rate": 0.001827, + "loss": 5.0109, + "step": 609 + }, + { + "epoch": 0.0061, + "grad_norm": 0.9727711844599547, + "learning_rate": 0.00183, + "loss": 4.9666, + "step": 610 + }, + { + "epoch": 0.00611, + "grad_norm": 0.9492725313696737, + "learning_rate": 0.001833, + "loss": 4.9894, + "step": 611 + }, + { + "epoch": 0.00612, + "grad_norm": 0.845936691035656, + "learning_rate": 0.001836, + "loss": 4.9768, + "step": 612 + }, + { + "epoch": 0.00613, + "grad_norm": 0.917579763764549, + "learning_rate": 0.001839, + "loss": 4.9836, + "step": 613 + }, + { + "epoch": 0.00614, + "grad_norm": 0.8975809320202123, + "learning_rate": 0.001842, + "loss": 5.0024, + "step": 614 + }, + { + "epoch": 0.00615, + "grad_norm": 1.1935315831043936, + "learning_rate": 0.001845, + "loss": 5.0018, + "step": 615 + }, + { + "epoch": 0.00616, + "grad_norm": 0.9948318214992812, + "learning_rate": 0.001848, + "loss": 4.9924, + "step": 616 + }, + { + "epoch": 0.00617, + "grad_norm": 0.8063669388844663, + "learning_rate": 0.001851, + "loss": 4.9919, + "step": 617 + }, + { + "epoch": 0.00618, + "grad_norm": 0.8184910219660716, + "learning_rate": 0.001854, + "loss": 4.9666, + "step": 618 + }, + { + "epoch": 0.00619, + "grad_norm": 0.7780464806882716, + "learning_rate": 0.001857, + "loss": 4.9753, + "step": 619 + }, + { + "epoch": 0.0062, + "grad_norm": 0.7430630101852395, + "learning_rate": 0.00186, + "loss": 4.9566, + "step": 620 + }, + { + "epoch": 0.00621, + "grad_norm": 0.8040699289060931, + "learning_rate": 0.001863, + "loss": 4.9542, + "step": 621 + }, + { + "epoch": 0.00622, + "grad_norm": 0.8423285566803137, + "learning_rate": 0.001866, + "loss": 4.9653, + "step": 622 + }, + { + "epoch": 0.00623, + "grad_norm": 0.6802855865245334, + "learning_rate": 0.001869, + "loss": 4.9365, + "step": 623 + }, + { + "epoch": 0.00624, + "grad_norm": 0.7045868643772514, + "learning_rate": 0.001872, + "loss": 4.9425, + "step": 624 + }, + { + "epoch": 0.00625, + "grad_norm": 0.69605003901388, + "learning_rate": 0.001875, + "loss": 4.9397, + "step": 625 + }, + { + "epoch": 0.00626, + "grad_norm": 0.8788947819856907, + "learning_rate": 0.0018780000000000001, + "loss": 4.9403, + "step": 626 + }, + { + "epoch": 0.00627, + "grad_norm": 0.8580113274469313, + "learning_rate": 0.001881, + "loss": 4.9238, + "step": 627 + }, + { + "epoch": 0.00628, + "grad_norm": 0.7437087045232712, + "learning_rate": 0.001884, + "loss": 4.9553, + "step": 628 + }, + { + "epoch": 0.00629, + "grad_norm": 0.673794469112573, + "learning_rate": 0.001887, + "loss": 4.9059, + "step": 629 + }, + { + "epoch": 0.0063, + "grad_norm": 0.7529443514224647, + "learning_rate": 0.00189, + "loss": 4.9225, + "step": 630 + }, + { + "epoch": 0.00631, + "grad_norm": 0.7882316002133182, + "learning_rate": 0.0018930000000000002, + "loss": 4.9159, + "step": 631 + }, + { + "epoch": 0.00632, + "grad_norm": 0.7345089369079263, + "learning_rate": 0.0018960000000000001, + "loss": 4.9318, + "step": 632 + }, + { + "epoch": 0.00633, + "grad_norm": 0.807557335679046, + "learning_rate": 0.001899, + "loss": 4.9156, + "step": 633 + }, + { + "epoch": 0.00634, + "grad_norm": 0.856273971211143, + "learning_rate": 0.001902, + "loss": 4.9086, + "step": 634 + }, + { + "epoch": 0.00635, + "grad_norm": 0.8041095750030954, + "learning_rate": 0.001905, + "loss": 4.8993, + "step": 635 + }, + { + "epoch": 0.00636, + "grad_norm": 0.8334087326563642, + "learning_rate": 0.001908, + "loss": 4.9117, + "step": 636 + }, + { + "epoch": 0.00637, + "grad_norm": 0.8711627404236827, + "learning_rate": 0.0019110000000000002, + "loss": 4.9242, + "step": 637 + }, + { + "epoch": 0.00638, + "grad_norm": 0.950273886749592, + "learning_rate": 0.0019140000000000001, + "loss": 4.918, + "step": 638 + }, + { + "epoch": 0.00639, + "grad_norm": 0.9763758019156279, + "learning_rate": 0.001917, + "loss": 4.8946, + "step": 639 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9069546349974866, + "learning_rate": 0.00192, + "loss": 4.9049, + "step": 640 + }, + { + "epoch": 0.00641, + "grad_norm": 0.7602914411110755, + "learning_rate": 0.001923, + "loss": 4.8967, + "step": 641 + }, + { + "epoch": 0.00642, + "grad_norm": 0.6358369958975738, + "learning_rate": 0.001926, + "loss": 4.8823, + "step": 642 + }, + { + "epoch": 0.00643, + "grad_norm": 0.6298192488222032, + "learning_rate": 0.0019290000000000002, + "loss": 4.8984, + "step": 643 + }, + { + "epoch": 0.00644, + "grad_norm": 0.5835488483304159, + "learning_rate": 0.0019320000000000001, + "loss": 4.8719, + "step": 644 + }, + { + "epoch": 0.00645, + "grad_norm": 0.509893487039198, + "learning_rate": 0.001935, + "loss": 4.8697, + "step": 645 + }, + { + "epoch": 0.00646, + "grad_norm": 0.5016743009567477, + "learning_rate": 0.001938, + "loss": 4.8822, + "step": 646 + }, + { + "epoch": 0.00647, + "grad_norm": 0.4834671196673339, + "learning_rate": 0.001941, + "loss": 4.8737, + "step": 647 + }, + { + "epoch": 0.00648, + "grad_norm": 0.4900666309904975, + "learning_rate": 0.0019440000000000002, + "loss": 4.8665, + "step": 648 + }, + { + "epoch": 0.00649, + "grad_norm": 0.6242032394190251, + "learning_rate": 0.0019470000000000002, + "loss": 4.8569, + "step": 649 + }, + { + "epoch": 0.0065, + "grad_norm": 0.7946730011730083, + "learning_rate": 0.0019500000000000001, + "loss": 4.8944, + "step": 650 + }, + { + "epoch": 0.00651, + "grad_norm": 0.7813096551019217, + "learning_rate": 0.001953, + "loss": 4.8797, + "step": 651 + }, + { + "epoch": 0.00652, + "grad_norm": 0.5708054833927125, + "learning_rate": 0.0019560000000000003, + "loss": 4.8538, + "step": 652 + }, + { + "epoch": 0.00653, + "grad_norm": 0.6416767177196502, + "learning_rate": 0.0019590000000000002, + "loss": 4.8612, + "step": 653 + }, + { + "epoch": 0.00654, + "grad_norm": 0.6414247107018044, + "learning_rate": 0.001962, + "loss": 4.8324, + "step": 654 + }, + { + "epoch": 0.00655, + "grad_norm": 0.5608777579230684, + "learning_rate": 0.001965, + "loss": 4.8581, + "step": 655 + }, + { + "epoch": 0.00656, + "grad_norm": 0.4812696659686437, + "learning_rate": 0.001968, + "loss": 4.8497, + "step": 656 + }, + { + "epoch": 0.00657, + "grad_norm": 0.5196607021803705, + "learning_rate": 0.001971, + "loss": 4.8212, + "step": 657 + }, + { + "epoch": 0.00658, + "grad_norm": 0.5384007134004025, + "learning_rate": 0.001974, + "loss": 4.8422, + "step": 658 + }, + { + "epoch": 0.00659, + "grad_norm": 0.6084877834513672, + "learning_rate": 0.001977, + "loss": 4.8215, + "step": 659 + }, + { + "epoch": 0.0066, + "grad_norm": 0.7589081819730935, + "learning_rate": 0.00198, + "loss": 4.8434, + "step": 660 + }, + { + "epoch": 0.00661, + "grad_norm": 0.7941713837035096, + "learning_rate": 0.001983, + "loss": 4.8155, + "step": 661 + }, + { + "epoch": 0.00662, + "grad_norm": 0.7978547164974868, + "learning_rate": 0.0019860000000000004, + "loss": 4.8127, + "step": 662 + }, + { + "epoch": 0.00663, + "grad_norm": 0.893378015840618, + "learning_rate": 0.0019890000000000003, + "loss": 4.8265, + "step": 663 + }, + { + "epoch": 0.00664, + "grad_norm": 0.899949440268495, + "learning_rate": 0.0019920000000000003, + "loss": 4.8283, + "step": 664 + }, + { + "epoch": 0.00665, + "grad_norm": 0.8127988991300923, + "learning_rate": 0.0019950000000000002, + "loss": 4.7986, + "step": 665 + }, + { + "epoch": 0.00666, + "grad_norm": 0.9200900241429067, + "learning_rate": 0.001998, + "loss": 4.8226, + "step": 666 + }, + { + "epoch": 0.00667, + "grad_norm": 1.037264674390151, + "learning_rate": 0.002001, + "loss": 4.8324, + "step": 667 + }, + { + "epoch": 0.00668, + "grad_norm": 0.8082146942337904, + "learning_rate": 0.002004, + "loss": 4.8427, + "step": 668 + }, + { + "epoch": 0.00669, + "grad_norm": 0.7033624756486074, + "learning_rate": 0.002007, + "loss": 4.8562, + "step": 669 + }, + { + "epoch": 0.0067, + "grad_norm": 0.751969455636164, + "learning_rate": 0.00201, + "loss": 4.8525, + "step": 670 + }, + { + "epoch": 0.00671, + "grad_norm": 0.736520529365372, + "learning_rate": 0.002013, + "loss": 4.8206, + "step": 671 + }, + { + "epoch": 0.00672, + "grad_norm": 0.7466982774868701, + "learning_rate": 0.002016, + "loss": 4.8129, + "step": 672 + }, + { + "epoch": 0.00673, + "grad_norm": 0.7025220166479262, + "learning_rate": 0.002019, + "loss": 4.8146, + "step": 673 + }, + { + "epoch": 0.00674, + "grad_norm": 0.8461453039283889, + "learning_rate": 0.0020220000000000004, + "loss": 4.8144, + "step": 674 + }, + { + "epoch": 0.00675, + "grad_norm": 0.9399881649158435, + "learning_rate": 0.0020250000000000003, + "loss": 4.8482, + "step": 675 + }, + { + "epoch": 0.00676, + "grad_norm": 0.9357632097468723, + "learning_rate": 0.0020280000000000003, + "loss": 4.8268, + "step": 676 + }, + { + "epoch": 0.00677, + "grad_norm": 0.7758960619033557, + "learning_rate": 0.0020310000000000003, + "loss": 4.8204, + "step": 677 + }, + { + "epoch": 0.00678, + "grad_norm": 0.699292513140664, + "learning_rate": 0.0020340000000000002, + "loss": 4.8248, + "step": 678 + }, + { + "epoch": 0.00679, + "grad_norm": 0.7370787957429817, + "learning_rate": 0.002037, + "loss": 4.816, + "step": 679 + }, + { + "epoch": 0.0068, + "grad_norm": 0.8377547362902558, + "learning_rate": 0.00204, + "loss": 4.8174, + "step": 680 + }, + { + "epoch": 0.00681, + "grad_norm": 0.8259782799224379, + "learning_rate": 0.002043, + "loss": 4.8155, + "step": 681 + }, + { + "epoch": 0.00682, + "grad_norm": 0.7684261091318535, + "learning_rate": 0.002046, + "loss": 4.8082, + "step": 682 + }, + { + "epoch": 0.00683, + "grad_norm": 0.8487367019402318, + "learning_rate": 0.002049, + "loss": 4.7989, + "step": 683 + }, + { + "epoch": 0.00684, + "grad_norm": 0.8838018894616847, + "learning_rate": 0.002052, + "loss": 4.8194, + "step": 684 + }, + { + "epoch": 0.00685, + "grad_norm": 0.8860697203395584, + "learning_rate": 0.0020550000000000004, + "loss": 4.8252, + "step": 685 + }, + { + "epoch": 0.00686, + "grad_norm": 0.7336183086529302, + "learning_rate": 0.0020580000000000004, + "loss": 4.7837, + "step": 686 + }, + { + "epoch": 0.00687, + "grad_norm": 0.8176630379413288, + "learning_rate": 0.0020610000000000003, + "loss": 4.7944, + "step": 687 + }, + { + "epoch": 0.00688, + "grad_norm": 0.7703386551342313, + "learning_rate": 0.002064, + "loss": 4.7926, + "step": 688 + }, + { + "epoch": 0.00689, + "grad_norm": 0.6919162061146223, + "learning_rate": 0.002067, + "loss": 4.7965, + "step": 689 + }, + { + "epoch": 0.0069, + "grad_norm": 0.7424392154268248, + "learning_rate": 0.00207, + "loss": 4.7893, + "step": 690 + }, + { + "epoch": 0.00691, + "grad_norm": 0.6515618524352145, + "learning_rate": 0.0020729999999999998, + "loss": 4.7559, + "step": 691 + }, + { + "epoch": 0.00692, + "grad_norm": 0.6440846578393002, + "learning_rate": 0.0020759999999999997, + "loss": 4.776, + "step": 692 + }, + { + "epoch": 0.00693, + "grad_norm": 0.6847536481828279, + "learning_rate": 0.0020789999999999997, + "loss": 4.7889, + "step": 693 + }, + { + "epoch": 0.00694, + "grad_norm": 0.6321576161870056, + "learning_rate": 0.002082, + "loss": 4.7627, + "step": 694 + }, + { + "epoch": 0.00695, + "grad_norm": 0.5791129920715202, + "learning_rate": 0.002085, + "loss": 4.7609, + "step": 695 + }, + { + "epoch": 0.00696, + "grad_norm": 0.5895865438272808, + "learning_rate": 0.002088, + "loss": 4.7723, + "step": 696 + }, + { + "epoch": 0.00697, + "grad_norm": 0.5008187604770619, + "learning_rate": 0.002091, + "loss": 4.7695, + "step": 697 + }, + { + "epoch": 0.00698, + "grad_norm": 0.6970439697756265, + "learning_rate": 0.002094, + "loss": 4.7619, + "step": 698 + }, + { + "epoch": 0.00699, + "grad_norm": 0.8941704543265332, + "learning_rate": 0.002097, + "loss": 4.7572, + "step": 699 + }, + { + "epoch": 0.007, + "grad_norm": 0.9068627730041655, + "learning_rate": 0.0021, + "loss": 4.787, + "step": 700 + }, + { + "epoch": 0.00701, + "grad_norm": 0.7146483381512303, + "learning_rate": 0.002103, + "loss": 4.7547, + "step": 701 + }, + { + "epoch": 0.00702, + "grad_norm": 0.9172255209446268, + "learning_rate": 0.002106, + "loss": 4.77, + "step": 702 + }, + { + "epoch": 0.00703, + "grad_norm": 0.9047172643914575, + "learning_rate": 0.0021089999999999998, + "loss": 4.7553, + "step": 703 + }, + { + "epoch": 0.00704, + "grad_norm": 0.7853692419556185, + "learning_rate": 0.0021119999999999997, + "loss": 4.7583, + "step": 704 + }, + { + "epoch": 0.00705, + "grad_norm": 0.7199878385614988, + "learning_rate": 0.002115, + "loss": 4.7725, + "step": 705 + }, + { + "epoch": 0.00706, + "grad_norm": 0.7213393080579115, + "learning_rate": 0.002118, + "loss": 4.7581, + "step": 706 + }, + { + "epoch": 0.00707, + "grad_norm": 0.7597119331851468, + "learning_rate": 0.002121, + "loss": 4.7413, + "step": 707 + }, + { + "epoch": 0.00708, + "grad_norm": 0.6864102182118973, + "learning_rate": 0.002124, + "loss": 4.7187, + "step": 708 + }, + { + "epoch": 0.00709, + "grad_norm": 0.7815902187763394, + "learning_rate": 0.002127, + "loss": 4.7572, + "step": 709 + }, + { + "epoch": 0.0071, + "grad_norm": 0.8451784595752648, + "learning_rate": 0.00213, + "loss": 4.7552, + "step": 710 + }, + { + "epoch": 0.00711, + "grad_norm": 1.0054655399528605, + "learning_rate": 0.002133, + "loss": 4.7414, + "step": 711 + }, + { + "epoch": 0.00712, + "grad_norm": 0.9031323884556907, + "learning_rate": 0.002136, + "loss": 4.7728, + "step": 712 + }, + { + "epoch": 0.00713, + "grad_norm": 0.961250906275713, + "learning_rate": 0.002139, + "loss": 4.7862, + "step": 713 + }, + { + "epoch": 0.00714, + "grad_norm": 0.9556615314074448, + "learning_rate": 0.002142, + "loss": 4.7819, + "step": 714 + }, + { + "epoch": 0.00715, + "grad_norm": 0.837203607680531, + "learning_rate": 0.0021449999999999998, + "loss": 4.7417, + "step": 715 + }, + { + "epoch": 0.00716, + "grad_norm": 0.7607986282551458, + "learning_rate": 0.002148, + "loss": 4.7359, + "step": 716 + }, + { + "epoch": 0.00717, + "grad_norm": 0.8703365352693242, + "learning_rate": 0.002151, + "loss": 4.7519, + "step": 717 + }, + { + "epoch": 0.00718, + "grad_norm": 0.8830641357048177, + "learning_rate": 0.002154, + "loss": 4.7536, + "step": 718 + }, + { + "epoch": 0.00719, + "grad_norm": 0.8090298717986324, + "learning_rate": 0.002157, + "loss": 4.7586, + "step": 719 + }, + { + "epoch": 0.0072, + "grad_norm": 0.7002439324520396, + "learning_rate": 0.00216, + "loss": 4.7466, + "step": 720 + }, + { + "epoch": 0.00721, + "grad_norm": 0.7540412799334538, + "learning_rate": 0.002163, + "loss": 4.7512, + "step": 721 + }, + { + "epoch": 0.00722, + "grad_norm": 0.7234067697970273, + "learning_rate": 0.002166, + "loss": 4.7241, + "step": 722 + }, + { + "epoch": 0.00723, + "grad_norm": 0.5796869415275953, + "learning_rate": 0.002169, + "loss": 4.73, + "step": 723 + }, + { + "epoch": 0.00724, + "grad_norm": 0.6360613090935692, + "learning_rate": 0.002172, + "loss": 4.7294, + "step": 724 + }, + { + "epoch": 0.00725, + "grad_norm": 0.6592111108932344, + "learning_rate": 0.002175, + "loss": 4.7232, + "step": 725 + }, + { + "epoch": 0.00726, + "grad_norm": 0.7000176967246123, + "learning_rate": 0.002178, + "loss": 4.7406, + "step": 726 + }, + { + "epoch": 0.00727, + "grad_norm": 0.6658154130327723, + "learning_rate": 0.0021809999999999998, + "loss": 4.7131, + "step": 727 + }, + { + "epoch": 0.00728, + "grad_norm": 0.500886702178687, + "learning_rate": 0.002184, + "loss": 4.7222, + "step": 728 + }, + { + "epoch": 0.00729, + "grad_norm": 0.553445989931654, + "learning_rate": 0.002187, + "loss": 4.7196, + "step": 729 + }, + { + "epoch": 0.0073, + "grad_norm": 0.5928953773304845, + "learning_rate": 0.00219, + "loss": 4.7153, + "step": 730 + }, + { + "epoch": 0.00731, + "grad_norm": 0.5280339609019513, + "learning_rate": 0.002193, + "loss": 4.7069, + "step": 731 + }, + { + "epoch": 0.00732, + "grad_norm": 0.4601497488067425, + "learning_rate": 0.002196, + "loss": 4.7146, + "step": 732 + }, + { + "epoch": 0.00733, + "grad_norm": 0.4831437067076967, + "learning_rate": 0.002199, + "loss": 4.6865, + "step": 733 + }, + { + "epoch": 0.00734, + "grad_norm": 0.48957731222976764, + "learning_rate": 0.002202, + "loss": 4.7176, + "step": 734 + }, + { + "epoch": 0.00735, + "grad_norm": 0.5029506248084066, + "learning_rate": 0.002205, + "loss": 4.7231, + "step": 735 + }, + { + "epoch": 0.00736, + "grad_norm": 0.5300436466729722, + "learning_rate": 0.002208, + "loss": 4.7045, + "step": 736 + }, + { + "epoch": 0.00737, + "grad_norm": 0.5354857814520255, + "learning_rate": 0.002211, + "loss": 4.6701, + "step": 737 + }, + { + "epoch": 0.00738, + "grad_norm": 0.6855959285026678, + "learning_rate": 0.002214, + "loss": 4.6857, + "step": 738 + }, + { + "epoch": 0.00739, + "grad_norm": 0.7193696222416395, + "learning_rate": 0.0022170000000000002, + "loss": 4.6773, + "step": 739 + }, + { + "epoch": 0.0074, + "grad_norm": 0.7000843676029133, + "learning_rate": 0.00222, + "loss": 4.686, + "step": 740 + }, + { + "epoch": 0.00741, + "grad_norm": 0.8262482718120322, + "learning_rate": 0.002223, + "loss": 4.6648, + "step": 741 + }, + { + "epoch": 0.00742, + "grad_norm": 0.8068053565529363, + "learning_rate": 0.002226, + "loss": 4.71, + "step": 742 + }, + { + "epoch": 0.00743, + "grad_norm": 0.7713935209386231, + "learning_rate": 0.002229, + "loss": 4.6667, + "step": 743 + }, + { + "epoch": 0.00744, + "grad_norm": 0.6806090978340125, + "learning_rate": 0.002232, + "loss": 4.668, + "step": 744 + }, + { + "epoch": 0.00745, + "grad_norm": 0.8150134425373752, + "learning_rate": 0.002235, + "loss": 4.6906, + "step": 745 + }, + { + "epoch": 0.00746, + "grad_norm": 0.9083405480081935, + "learning_rate": 0.002238, + "loss": 4.6909, + "step": 746 + }, + { + "epoch": 0.00747, + "grad_norm": 1.1312224699232258, + "learning_rate": 0.002241, + "loss": 4.6956, + "step": 747 + }, + { + "epoch": 0.00748, + "grad_norm": 0.8174601291810354, + "learning_rate": 0.002244, + "loss": 4.6973, + "step": 748 + }, + { + "epoch": 0.00749, + "grad_norm": 0.8423282795209855, + "learning_rate": 0.002247, + "loss": 4.6802, + "step": 749 + }, + { + "epoch": 0.0075, + "grad_norm": 0.8679250685858194, + "learning_rate": 0.0022500000000000003, + "loss": 4.7268, + "step": 750 + }, + { + "epoch": 0.00751, + "grad_norm": 1.0939871221004271, + "learning_rate": 0.0022530000000000002, + "loss": 4.7337, + "step": 751 + }, + { + "epoch": 0.00752, + "grad_norm": 0.9886618564303525, + "learning_rate": 0.002256, + "loss": 4.6822, + "step": 752 + }, + { + "epoch": 0.00753, + "grad_norm": 0.9258452428585717, + "learning_rate": 0.002259, + "loss": 4.7192, + "step": 753 + }, + { + "epoch": 0.00754, + "grad_norm": 0.8790583060068752, + "learning_rate": 0.002262, + "loss": 4.7345, + "step": 754 + }, + { + "epoch": 0.00755, + "grad_norm": 0.7727162685258321, + "learning_rate": 0.002265, + "loss": 4.6919, + "step": 755 + }, + { + "epoch": 0.00756, + "grad_norm": 0.8048369196552551, + "learning_rate": 0.002268, + "loss": 4.6801, + "step": 756 + }, + { + "epoch": 0.00757, + "grad_norm": 0.7307749692176225, + "learning_rate": 0.002271, + "loss": 4.6902, + "step": 757 + }, + { + "epoch": 0.00758, + "grad_norm": 0.7628918458541498, + "learning_rate": 0.002274, + "loss": 4.6882, + "step": 758 + }, + { + "epoch": 0.00759, + "grad_norm": 0.6811469027490352, + "learning_rate": 0.002277, + "loss": 4.693, + "step": 759 + }, + { + "epoch": 0.0076, + "grad_norm": 0.5530412513377371, + "learning_rate": 0.00228, + "loss": 4.6735, + "step": 760 + }, + { + "epoch": 0.00761, + "grad_norm": 0.5221449888101848, + "learning_rate": 0.002283, + "loss": 4.6405, + "step": 761 + }, + { + "epoch": 0.00762, + "grad_norm": 0.5687089394846262, + "learning_rate": 0.0022860000000000003, + "loss": 4.6208, + "step": 762 + }, + { + "epoch": 0.00763, + "grad_norm": 0.5814285508645848, + "learning_rate": 0.0022890000000000002, + "loss": 4.6718, + "step": 763 + }, + { + "epoch": 0.00764, + "grad_norm": 0.6388540284979518, + "learning_rate": 0.002292, + "loss": 4.683, + "step": 764 + }, + { + "epoch": 0.00765, + "grad_norm": 0.7278589788698283, + "learning_rate": 0.002295, + "loss": 4.6752, + "step": 765 + }, + { + "epoch": 0.00766, + "grad_norm": 0.7050667087459527, + "learning_rate": 0.002298, + "loss": 4.6715, + "step": 766 + }, + { + "epoch": 0.00767, + "grad_norm": 0.6023307971425665, + "learning_rate": 0.002301, + "loss": 4.6623, + "step": 767 + }, + { + "epoch": 0.00768, + "grad_norm": 0.7162182495110988, + "learning_rate": 0.002304, + "loss": 4.6687, + "step": 768 + }, + { + "epoch": 0.00769, + "grad_norm": 0.8754398583131885, + "learning_rate": 0.002307, + "loss": 4.6855, + "step": 769 + }, + { + "epoch": 0.0077, + "grad_norm": 0.9282950433205286, + "learning_rate": 0.00231, + "loss": 4.656, + "step": 770 + }, + { + "epoch": 0.00771, + "grad_norm": 0.8826983762517153, + "learning_rate": 0.002313, + "loss": 4.6927, + "step": 771 + }, + { + "epoch": 0.00772, + "grad_norm": 0.7955428299875815, + "learning_rate": 0.002316, + "loss": 4.6752, + "step": 772 + }, + { + "epoch": 0.00773, + "grad_norm": 0.6879017191297421, + "learning_rate": 0.0023190000000000003, + "loss": 4.6732, + "step": 773 + }, + { + "epoch": 0.00774, + "grad_norm": 0.5805904836674535, + "learning_rate": 0.0023220000000000003, + "loss": 4.6842, + "step": 774 + }, + { + "epoch": 0.00775, + "grad_norm": 0.5872309146101224, + "learning_rate": 0.0023250000000000002, + "loss": 4.6741, + "step": 775 + }, + { + "epoch": 0.00776, + "grad_norm": 0.6663800275805344, + "learning_rate": 0.002328, + "loss": 4.6762, + "step": 776 + }, + { + "epoch": 0.00777, + "grad_norm": 0.6475349218207965, + "learning_rate": 0.002331, + "loss": 4.6499, + "step": 777 + }, + { + "epoch": 0.00778, + "grad_norm": 0.6498373909650491, + "learning_rate": 0.002334, + "loss": 4.6643, + "step": 778 + }, + { + "epoch": 0.00779, + "grad_norm": 0.6357690952406082, + "learning_rate": 0.002337, + "loss": 4.6181, + "step": 779 + }, + { + "epoch": 0.0078, + "grad_norm": 0.6241849680287349, + "learning_rate": 0.00234, + "loss": 4.6419, + "step": 780 + }, + { + "epoch": 0.00781, + "grad_norm": 0.6808062655697796, + "learning_rate": 0.002343, + "loss": 4.6451, + "step": 781 + }, + { + "epoch": 0.00782, + "grad_norm": 0.7065267585011001, + "learning_rate": 0.002346, + "loss": 4.6436, + "step": 782 + }, + { + "epoch": 0.00783, + "grad_norm": 0.6381701638777938, + "learning_rate": 0.002349, + "loss": 4.6242, + "step": 783 + }, + { + "epoch": 0.00784, + "grad_norm": 0.647841330234918, + "learning_rate": 0.002352, + "loss": 4.6355, + "step": 784 + }, + { + "epoch": 0.00785, + "grad_norm": 0.60562299847623, + "learning_rate": 0.0023550000000000003, + "loss": 4.6345, + "step": 785 + }, + { + "epoch": 0.00786, + "grad_norm": 0.5148036449557751, + "learning_rate": 0.0023580000000000003, + "loss": 4.628, + "step": 786 + }, + { + "epoch": 0.00787, + "grad_norm": 0.7217903580979332, + "learning_rate": 0.0023610000000000003, + "loss": 4.641, + "step": 787 + }, + { + "epoch": 0.00788, + "grad_norm": 0.8515178253715305, + "learning_rate": 0.002364, + "loss": 4.6466, + "step": 788 + }, + { + "epoch": 0.00789, + "grad_norm": 0.8831947515557061, + "learning_rate": 0.002367, + "loss": 4.6753, + "step": 789 + }, + { + "epoch": 0.0079, + "grad_norm": 0.9816312249435458, + "learning_rate": 0.00237, + "loss": 4.6574, + "step": 790 + }, + { + "epoch": 0.00791, + "grad_norm": 0.8257428278422617, + "learning_rate": 0.002373, + "loss": 4.6081, + "step": 791 + }, + { + "epoch": 0.00792, + "grad_norm": 0.6889392893349975, + "learning_rate": 0.002376, + "loss": 4.6372, + "step": 792 + }, + { + "epoch": 0.00793, + "grad_norm": 0.5470340397913868, + "learning_rate": 0.002379, + "loss": 4.6272, + "step": 793 + }, + { + "epoch": 0.00794, + "grad_norm": 0.5333909794818302, + "learning_rate": 0.002382, + "loss": 4.6359, + "step": 794 + }, + { + "epoch": 0.00795, + "grad_norm": 0.47330116805918854, + "learning_rate": 0.002385, + "loss": 4.6105, + "step": 795 + }, + { + "epoch": 0.00796, + "grad_norm": 0.46996625387544017, + "learning_rate": 0.0023880000000000004, + "loss": 4.6049, + "step": 796 + }, + { + "epoch": 0.00797, + "grad_norm": 0.4648603328586337, + "learning_rate": 0.0023910000000000003, + "loss": 4.6461, + "step": 797 + }, + { + "epoch": 0.00798, + "grad_norm": 0.48147045136320854, + "learning_rate": 0.0023940000000000003, + "loss": 4.6125, + "step": 798 + }, + { + "epoch": 0.00799, + "grad_norm": 0.5520002928611956, + "learning_rate": 0.0023970000000000003, + "loss": 4.6461, + "step": 799 + }, + { + "epoch": 0.008, + "grad_norm": 0.49520453644625784, + "learning_rate": 0.0024000000000000002, + "loss": 4.5958, + "step": 800 + }, + { + "epoch": 0.00801, + "grad_norm": 0.4617883447215667, + "learning_rate": 0.002403, + "loss": 4.6244, + "step": 801 + }, + { + "epoch": 0.00802, + "grad_norm": 0.6171622349156032, + "learning_rate": 0.002406, + "loss": 4.6206, + "step": 802 + }, + { + "epoch": 0.00803, + "grad_norm": 0.7819651777797723, + "learning_rate": 0.002409, + "loss": 4.605, + "step": 803 + }, + { + "epoch": 0.00804, + "grad_norm": 0.8072878053268496, + "learning_rate": 0.002412, + "loss": 4.6195, + "step": 804 + }, + { + "epoch": 0.00805, + "grad_norm": 0.6878812715646375, + "learning_rate": 0.002415, + "loss": 4.6081, + "step": 805 + }, + { + "epoch": 0.00806, + "grad_norm": 0.7628005366591507, + "learning_rate": 0.002418, + "loss": 4.6308, + "step": 806 + }, + { + "epoch": 0.00807, + "grad_norm": 0.7958527408861041, + "learning_rate": 0.0024210000000000004, + "loss": 4.6228, + "step": 807 + }, + { + "epoch": 0.00808, + "grad_norm": 0.7899389450719584, + "learning_rate": 0.0024240000000000004, + "loss": 4.6083, + "step": 808 + }, + { + "epoch": 0.00809, + "grad_norm": 1.193539574978161, + "learning_rate": 0.0024270000000000003, + "loss": 4.6201, + "step": 809 + }, + { + "epoch": 0.0081, + "grad_norm": 1.0067442084818319, + "learning_rate": 0.0024300000000000003, + "loss": 4.6554, + "step": 810 + }, + { + "epoch": 0.00811, + "grad_norm": 1.0302946243785736, + "learning_rate": 0.0024330000000000003, + "loss": 4.6338, + "step": 811 + }, + { + "epoch": 0.00812, + "grad_norm": 0.7966936317025509, + "learning_rate": 0.0024360000000000002, + "loss": 4.6044, + "step": 812 + }, + { + "epoch": 0.00813, + "grad_norm": 0.7843512654561826, + "learning_rate": 0.0024389999999999998, + "loss": 4.6449, + "step": 813 + }, + { + "epoch": 0.00814, + "grad_norm": 0.7981724618057067, + "learning_rate": 0.0024419999999999997, + "loss": 4.6482, + "step": 814 + }, + { + "epoch": 0.00815, + "grad_norm": 0.8382113380987876, + "learning_rate": 0.0024449999999999997, + "loss": 4.6588, + "step": 815 + }, + { + "epoch": 0.00816, + "grad_norm": 0.8204581966398267, + "learning_rate": 0.002448, + "loss": 4.6311, + "step": 816 + }, + { + "epoch": 0.00817, + "grad_norm": 1.02336938073518, + "learning_rate": 0.002451, + "loss": 4.6583, + "step": 817 + }, + { + "epoch": 0.00818, + "grad_norm": 0.815478540285485, + "learning_rate": 0.002454, + "loss": 4.6309, + "step": 818 + }, + { + "epoch": 0.00819, + "grad_norm": 0.9108711148393207, + "learning_rate": 0.002457, + "loss": 4.6483, + "step": 819 + }, + { + "epoch": 0.0082, + "grad_norm": 0.8773364113378127, + "learning_rate": 0.00246, + "loss": 4.6373, + "step": 820 + }, + { + "epoch": 0.00821, + "grad_norm": 0.8093045902813614, + "learning_rate": 0.002463, + "loss": 4.6126, + "step": 821 + }, + { + "epoch": 0.00822, + "grad_norm": 0.8775561270633004, + "learning_rate": 0.002466, + "loss": 4.6164, + "step": 822 + }, + { + "epoch": 0.00823, + "grad_norm": 0.8916321669966187, + "learning_rate": 0.002469, + "loss": 4.6298, + "step": 823 + }, + { + "epoch": 0.00824, + "grad_norm": 0.7939051533904264, + "learning_rate": 0.002472, + "loss": 4.6511, + "step": 824 + }, + { + "epoch": 0.00825, + "grad_norm": 0.9509206049767348, + "learning_rate": 0.0024749999999999998, + "loss": 4.6625, + "step": 825 + }, + { + "epoch": 0.00826, + "grad_norm": 1.0914612729506281, + "learning_rate": 0.0024779999999999997, + "loss": 4.6356, + "step": 826 + }, + { + "epoch": 0.00827, + "grad_norm": 1.0440371247088225, + "learning_rate": 0.002481, + "loss": 4.6709, + "step": 827 + }, + { + "epoch": 0.00828, + "grad_norm": 1.0292672719671891, + "learning_rate": 0.002484, + "loss": 4.6644, + "step": 828 + }, + { + "epoch": 0.00829, + "grad_norm": 1.002549149572049, + "learning_rate": 0.002487, + "loss": 4.6608, + "step": 829 + }, + { + "epoch": 0.0083, + "grad_norm": 0.8229445260626227, + "learning_rate": 0.00249, + "loss": 4.6642, + "step": 830 + }, + { + "epoch": 0.00831, + "grad_norm": 0.7223475739297199, + "learning_rate": 0.002493, + "loss": 4.6379, + "step": 831 + }, + { + "epoch": 0.00832, + "grad_norm": 0.6422365780456449, + "learning_rate": 0.002496, + "loss": 4.6349, + "step": 832 + }, + { + "epoch": 0.00833, + "grad_norm": 0.6433276699815419, + "learning_rate": 0.002499, + "loss": 4.626, + "step": 833 + }, + { + "epoch": 0.00834, + "grad_norm": 0.6136587911860008, + "learning_rate": 0.002502, + "loss": 4.6284, + "step": 834 + }, + { + "epoch": 0.00835, + "grad_norm": 0.5892258230535582, + "learning_rate": 0.002505, + "loss": 4.617, + "step": 835 + }, + { + "epoch": 0.00836, + "grad_norm": 0.5407454758774727, + "learning_rate": 0.002508, + "loss": 4.6416, + "step": 836 + }, + { + "epoch": 0.00837, + "grad_norm": 0.5840603338652609, + "learning_rate": 0.0025109999999999998, + "loss": 4.582, + "step": 837 + }, + { + "epoch": 0.00838, + "grad_norm": 0.5192725472759927, + "learning_rate": 0.0025139999999999997, + "loss": 4.6102, + "step": 838 + }, + { + "epoch": 0.00839, + "grad_norm": 0.5064380785759203, + "learning_rate": 0.002517, + "loss": 4.6034, + "step": 839 + }, + { + "epoch": 0.0084, + "grad_norm": 0.48476724687493267, + "learning_rate": 0.00252, + "loss": 4.5854, + "step": 840 + }, + { + "epoch": 0.00841, + "grad_norm": 0.44496684540968734, + "learning_rate": 0.002523, + "loss": 4.5786, + "step": 841 + }, + { + "epoch": 0.00842, + "grad_norm": 0.42609007840223895, + "learning_rate": 0.002526, + "loss": 4.5771, + "step": 842 + }, + { + "epoch": 0.00843, + "grad_norm": 0.4703393687667864, + "learning_rate": 0.002529, + "loss": 4.5751, + "step": 843 + }, + { + "epoch": 0.00844, + "grad_norm": 0.4976619641323943, + "learning_rate": 0.002532, + "loss": 4.5956, + "step": 844 + }, + { + "epoch": 0.00845, + "grad_norm": 0.530881004052944, + "learning_rate": 0.002535, + "loss": 4.5625, + "step": 845 + }, + { + "epoch": 0.00846, + "grad_norm": 0.5069253535552343, + "learning_rate": 0.002538, + "loss": 4.584, + "step": 846 + }, + { + "epoch": 0.00847, + "grad_norm": 0.4900054637856495, + "learning_rate": 0.002541, + "loss": 4.5637, + "step": 847 + }, + { + "epoch": 0.00848, + "grad_norm": 0.5361511355183629, + "learning_rate": 0.002544, + "loss": 4.5693, + "step": 848 + }, + { + "epoch": 0.00849, + "grad_norm": 0.6067359238432654, + "learning_rate": 0.002547, + "loss": 4.5644, + "step": 849 + }, + { + "epoch": 0.0085, + "grad_norm": 0.5519192846207763, + "learning_rate": 0.00255, + "loss": 4.573, + "step": 850 + }, + { + "epoch": 0.00851, + "grad_norm": 0.46694877598438245, + "learning_rate": 0.002553, + "loss": 4.5875, + "step": 851 + }, + { + "epoch": 0.00852, + "grad_norm": 0.477565098915178, + "learning_rate": 0.002556, + "loss": 4.5765, + "step": 852 + }, + { + "epoch": 0.00853, + "grad_norm": 0.5020213435824815, + "learning_rate": 0.002559, + "loss": 4.5556, + "step": 853 + }, + { + "epoch": 0.00854, + "grad_norm": 0.5171409161048013, + "learning_rate": 0.002562, + "loss": 4.5495, + "step": 854 + }, + { + "epoch": 0.00855, + "grad_norm": 0.46627459343076927, + "learning_rate": 0.002565, + "loss": 4.5252, + "step": 855 + }, + { + "epoch": 0.00856, + "grad_norm": 0.5139521756940325, + "learning_rate": 0.002568, + "loss": 4.5623, + "step": 856 + }, + { + "epoch": 0.00857, + "grad_norm": 0.6011403998041547, + "learning_rate": 0.002571, + "loss": 4.5577, + "step": 857 + }, + { + "epoch": 0.00858, + "grad_norm": 0.554768384377006, + "learning_rate": 0.002574, + "loss": 4.5487, + "step": 858 + }, + { + "epoch": 0.00859, + "grad_norm": 0.539858542755145, + "learning_rate": 0.002577, + "loss": 4.5383, + "step": 859 + }, + { + "epoch": 0.0086, + "grad_norm": 0.586599307397693, + "learning_rate": 0.00258, + "loss": 4.571, + "step": 860 + }, + { + "epoch": 0.00861, + "grad_norm": 0.7154666400015554, + "learning_rate": 0.0025830000000000002, + "loss": 4.5928, + "step": 861 + }, + { + "epoch": 0.00862, + "grad_norm": 0.7345971792604707, + "learning_rate": 0.002586, + "loss": 4.5402, + "step": 862 + }, + { + "epoch": 0.00863, + "grad_norm": 0.7491760821516434, + "learning_rate": 0.002589, + "loss": 4.5765, + "step": 863 + }, + { + "epoch": 0.00864, + "grad_norm": 0.9255705361922033, + "learning_rate": 0.002592, + "loss": 4.5322, + "step": 864 + }, + { + "epoch": 0.00865, + "grad_norm": 0.9964986146275199, + "learning_rate": 0.002595, + "loss": 4.5664, + "step": 865 + }, + { + "epoch": 0.00866, + "grad_norm": 0.7618488122087141, + "learning_rate": 0.002598, + "loss": 4.5836, + "step": 866 + }, + { + "epoch": 0.00867, + "grad_norm": 0.8524780083566116, + "learning_rate": 0.002601, + "loss": 4.5742, + "step": 867 + }, + { + "epoch": 0.00868, + "grad_norm": 0.9692981312410378, + "learning_rate": 0.002604, + "loss": 4.5808, + "step": 868 + }, + { + "epoch": 0.00869, + "grad_norm": 1.1822252043975705, + "learning_rate": 0.002607, + "loss": 4.6073, + "step": 869 + }, + { + "epoch": 0.0087, + "grad_norm": 0.9057663759386707, + "learning_rate": 0.00261, + "loss": 4.5844, + "step": 870 + }, + { + "epoch": 0.00871, + "grad_norm": 0.9457338978675252, + "learning_rate": 0.002613, + "loss": 4.6115, + "step": 871 + }, + { + "epoch": 0.00872, + "grad_norm": 0.9845348105394848, + "learning_rate": 0.002616, + "loss": 4.5975, + "step": 872 + }, + { + "epoch": 0.00873, + "grad_norm": 0.8202179076205192, + "learning_rate": 0.0026190000000000002, + "loss": 4.5967, + "step": 873 + }, + { + "epoch": 0.00874, + "grad_norm": 0.6587988147688274, + "learning_rate": 0.002622, + "loss": 4.6142, + "step": 874 + }, + { + "epoch": 0.00875, + "grad_norm": 0.6312495265838277, + "learning_rate": 0.002625, + "loss": 4.5549, + "step": 875 + }, + { + "epoch": 0.00876, + "grad_norm": 0.6646817876274769, + "learning_rate": 0.002628, + "loss": 4.5838, + "step": 876 + }, + { + "epoch": 0.00877, + "grad_norm": 0.632354886157607, + "learning_rate": 0.002631, + "loss": 4.6011, + "step": 877 + }, + { + "epoch": 0.00878, + "grad_norm": 0.5630676163174572, + "learning_rate": 0.002634, + "loss": 4.5288, + "step": 878 + }, + { + "epoch": 0.00879, + "grad_norm": 0.4918531988998375, + "learning_rate": 0.002637, + "loss": 4.5559, + "step": 879 + }, + { + "epoch": 0.0088, + "grad_norm": 0.42476181684324305, + "learning_rate": 0.00264, + "loss": 4.5634, + "step": 880 + }, + { + "epoch": 0.00881, + "grad_norm": 0.4573573466722849, + "learning_rate": 0.002643, + "loss": 4.5644, + "step": 881 + }, + { + "epoch": 0.00882, + "grad_norm": 0.5881448674370812, + "learning_rate": 0.002646, + "loss": 4.5659, + "step": 882 + }, + { + "epoch": 0.00883, + "grad_norm": 0.7764456560266775, + "learning_rate": 0.002649, + "loss": 4.5645, + "step": 883 + }, + { + "epoch": 0.00884, + "grad_norm": 0.9651176541039754, + "learning_rate": 0.0026520000000000003, + "loss": 4.5629, + "step": 884 + }, + { + "epoch": 0.00885, + "grad_norm": 1.0453630458113787, + "learning_rate": 0.0026550000000000002, + "loss": 4.5675, + "step": 885 + }, + { + "epoch": 0.00886, + "grad_norm": 0.8249295551343052, + "learning_rate": 0.002658, + "loss": 4.5811, + "step": 886 + }, + { + "epoch": 0.00887, + "grad_norm": 0.6632170677276661, + "learning_rate": 0.002661, + "loss": 4.5643, + "step": 887 + }, + { + "epoch": 0.00888, + "grad_norm": 0.7818922566742896, + "learning_rate": 0.002664, + "loss": 4.5709, + "step": 888 + }, + { + "epoch": 0.00889, + "grad_norm": 0.8131057291041344, + "learning_rate": 0.002667, + "loss": 4.5276, + "step": 889 + }, + { + "epoch": 0.0089, + "grad_norm": 0.7364786352062309, + "learning_rate": 0.00267, + "loss": 4.5735, + "step": 890 + }, + { + "epoch": 0.00891, + "grad_norm": 0.6174568923006037, + "learning_rate": 0.002673, + "loss": 4.5538, + "step": 891 + }, + { + "epoch": 0.00892, + "grad_norm": 0.6060396654742667, + "learning_rate": 0.002676, + "loss": 4.5678, + "step": 892 + }, + { + "epoch": 0.00893, + "grad_norm": 0.6503337239639668, + "learning_rate": 0.002679, + "loss": 4.5453, + "step": 893 + }, + { + "epoch": 0.00894, + "grad_norm": 0.6599395002026207, + "learning_rate": 0.002682, + "loss": 4.5291, + "step": 894 + }, + { + "epoch": 0.00895, + "grad_norm": 0.5989877186645693, + "learning_rate": 0.0026850000000000003, + "loss": 4.5412, + "step": 895 + }, + { + "epoch": 0.00896, + "grad_norm": 0.5286031214975206, + "learning_rate": 0.0026880000000000003, + "loss": 4.5273, + "step": 896 + }, + { + "epoch": 0.00897, + "grad_norm": 0.6246596167729576, + "learning_rate": 0.0026910000000000002, + "loss": 4.5504, + "step": 897 + }, + { + "epoch": 0.00898, + "grad_norm": 0.6886920087577523, + "learning_rate": 0.002694, + "loss": 4.5437, + "step": 898 + }, + { + "epoch": 0.00899, + "grad_norm": 0.7603324493631337, + "learning_rate": 0.002697, + "loss": 4.5543, + "step": 899 + }, + { + "epoch": 0.009, + "grad_norm": 0.7773743953648492, + "learning_rate": 0.0027, + "loss": 4.5794, + "step": 900 + }, + { + "epoch": 0.00901, + "grad_norm": 0.683256197441996, + "learning_rate": 0.002703, + "loss": 4.5307, + "step": 901 + }, + { + "epoch": 0.00902, + "grad_norm": 0.5681357763332335, + "learning_rate": 0.002706, + "loss": 4.5356, + "step": 902 + }, + { + "epoch": 0.00903, + "grad_norm": 0.5420591540444755, + "learning_rate": 0.002709, + "loss": 4.5338, + "step": 903 + }, + { + "epoch": 0.00904, + "grad_norm": 0.5224631659490503, + "learning_rate": 0.002712, + "loss": 4.5093, + "step": 904 + }, + { + "epoch": 0.00905, + "grad_norm": 0.5026034590467293, + "learning_rate": 0.002715, + "loss": 4.5252, + "step": 905 + }, + { + "epoch": 0.00906, + "grad_norm": 0.5177890071237494, + "learning_rate": 0.002718, + "loss": 4.5378, + "step": 906 + }, + { + "epoch": 0.00907, + "grad_norm": 0.5764689015080159, + "learning_rate": 0.0027210000000000003, + "loss": 4.5536, + "step": 907 + }, + { + "epoch": 0.00908, + "grad_norm": 0.6259624722487185, + "learning_rate": 0.0027240000000000003, + "loss": 4.5265, + "step": 908 + }, + { + "epoch": 0.00909, + "grad_norm": 0.670091172363038, + "learning_rate": 0.0027270000000000003, + "loss": 4.5481, + "step": 909 + }, + { + "epoch": 0.0091, + "grad_norm": 0.7211417475777565, + "learning_rate": 0.0027300000000000002, + "loss": 4.555, + "step": 910 + }, + { + "epoch": 0.00911, + "grad_norm": 0.6734826041799787, + "learning_rate": 0.002733, + "loss": 4.5118, + "step": 911 + }, + { + "epoch": 0.00912, + "grad_norm": 0.5721394003951694, + "learning_rate": 0.002736, + "loss": 4.5134, + "step": 912 + }, + { + "epoch": 0.00913, + "grad_norm": 0.5576363751689392, + "learning_rate": 0.002739, + "loss": 4.5062, + "step": 913 + }, + { + "epoch": 0.00914, + "grad_norm": 0.659556953854551, + "learning_rate": 0.002742, + "loss": 4.5426, + "step": 914 + }, + { + "epoch": 0.00915, + "grad_norm": 0.7056386423863461, + "learning_rate": 0.002745, + "loss": 4.5041, + "step": 915 + }, + { + "epoch": 0.00916, + "grad_norm": 0.6615396800381155, + "learning_rate": 0.002748, + "loss": 4.4829, + "step": 916 + }, + { + "epoch": 0.00917, + "grad_norm": 0.6203895946828626, + "learning_rate": 0.002751, + "loss": 4.4952, + "step": 917 + }, + { + "epoch": 0.00918, + "grad_norm": 0.6455523293432982, + "learning_rate": 0.0027540000000000004, + "loss": 4.5159, + "step": 918 + }, + { + "epoch": 0.00919, + "grad_norm": 0.6153975854015812, + "learning_rate": 0.0027570000000000003, + "loss": 4.4956, + "step": 919 + }, + { + "epoch": 0.0092, + "grad_norm": 0.6747074236078813, + "learning_rate": 0.0027600000000000003, + "loss": 4.516, + "step": 920 + }, + { + "epoch": 0.00921, + "grad_norm": 0.7525938258933302, + "learning_rate": 0.0027630000000000003, + "loss": 4.5035, + "step": 921 + }, + { + "epoch": 0.00922, + "grad_norm": 0.6613679668687953, + "learning_rate": 0.0027660000000000002, + "loss": 4.5161, + "step": 922 + }, + { + "epoch": 0.00923, + "grad_norm": 0.657751627761755, + "learning_rate": 0.002769, + "loss": 4.4882, + "step": 923 + }, + { + "epoch": 0.00924, + "grad_norm": 0.6747361910557791, + "learning_rate": 0.002772, + "loss": 4.515, + "step": 924 + }, + { + "epoch": 0.00925, + "grad_norm": 0.6239646474738244, + "learning_rate": 0.002775, + "loss": 4.4929, + "step": 925 + }, + { + "epoch": 0.00926, + "grad_norm": 0.599067117804374, + "learning_rate": 0.002778, + "loss": 4.5097, + "step": 926 + }, + { + "epoch": 0.00927, + "grad_norm": 0.5594951339370652, + "learning_rate": 0.002781, + "loss": 4.4719, + "step": 927 + }, + { + "epoch": 0.00928, + "grad_norm": 0.6063845847981192, + "learning_rate": 0.002784, + "loss": 4.518, + "step": 928 + }, + { + "epoch": 0.00929, + "grad_norm": 0.6363503981385901, + "learning_rate": 0.0027870000000000004, + "loss": 4.5031, + "step": 929 + }, + { + "epoch": 0.0093, + "grad_norm": 0.7396258221206569, + "learning_rate": 0.0027900000000000004, + "loss": 4.4944, + "step": 930 + }, + { + "epoch": 0.00931, + "grad_norm": 0.8942550404249334, + "learning_rate": 0.0027930000000000003, + "loss": 4.517, + "step": 931 + }, + { + "epoch": 0.00932, + "grad_norm": 1.0354660733966428, + "learning_rate": 0.0027960000000000003, + "loss": 4.5402, + "step": 932 + }, + { + "epoch": 0.00933, + "grad_norm": 1.180367237398422, + "learning_rate": 0.0027990000000000003, + "loss": 4.525, + "step": 933 + }, + { + "epoch": 0.00934, + "grad_norm": 0.9502642927196222, + "learning_rate": 0.0028020000000000002, + "loss": 4.5541, + "step": 934 + }, + { + "epoch": 0.00935, + "grad_norm": 0.8858808588378486, + "learning_rate": 0.002805, + "loss": 4.5601, + "step": 935 + }, + { + "epoch": 0.00936, + "grad_norm": 0.9244572615029755, + "learning_rate": 0.002808, + "loss": 4.569, + "step": 936 + }, + { + "epoch": 0.00937, + "grad_norm": 0.9225067704838915, + "learning_rate": 0.002811, + "loss": 4.5806, + "step": 937 + }, + { + "epoch": 0.00938, + "grad_norm": 0.8534895885659719, + "learning_rate": 0.002814, + "loss": 4.5604, + "step": 938 + }, + { + "epoch": 0.00939, + "grad_norm": 0.9046137087836131, + "learning_rate": 0.002817, + "loss": 4.5554, + "step": 939 + }, + { + "epoch": 0.0094, + "grad_norm": 0.7476401969651744, + "learning_rate": 0.00282, + "loss": 4.5482, + "step": 940 + }, + { + "epoch": 0.00941, + "grad_norm": 0.7066455268972154, + "learning_rate": 0.002823, + "loss": 4.5225, + "step": 941 + }, + { + "epoch": 0.00942, + "grad_norm": 0.6459119835251312, + "learning_rate": 0.002826, + "loss": 4.544, + "step": 942 + }, + { + "epoch": 0.00943, + "grad_norm": 0.6055625869260791, + "learning_rate": 0.002829, + "loss": 4.5363, + "step": 943 + }, + { + "epoch": 0.00944, + "grad_norm": 0.5293280202161804, + "learning_rate": 0.002832, + "loss": 4.5307, + "step": 944 + }, + { + "epoch": 0.00945, + "grad_norm": 0.5438831110130244, + "learning_rate": 0.002835, + "loss": 4.5334, + "step": 945 + }, + { + "epoch": 0.00946, + "grad_norm": 0.4766420743521973, + "learning_rate": 0.002838, + "loss": 4.5433, + "step": 946 + }, + { + "epoch": 0.00947, + "grad_norm": 0.40195567884756706, + "learning_rate": 0.0028409999999999998, + "loss": 4.5081, + "step": 947 + }, + { + "epoch": 0.00948, + "grad_norm": 0.3783844921089427, + "learning_rate": 0.0028439999999999997, + "loss": 4.4865, + "step": 948 + }, + { + "epoch": 0.00949, + "grad_norm": 0.4197576569837563, + "learning_rate": 0.002847, + "loss": 4.5306, + "step": 949 + }, + { + "epoch": 0.0095, + "grad_norm": 0.45947372263331304, + "learning_rate": 0.00285, + "loss": 4.5193, + "step": 950 + }, + { + "epoch": 0.00951, + "grad_norm": 0.5187245758366383, + "learning_rate": 0.002853, + "loss": 4.4969, + "step": 951 + }, + { + "epoch": 0.00952, + "grad_norm": 0.5515692080168162, + "learning_rate": 0.002856, + "loss": 4.5218, + "step": 952 + }, + { + "epoch": 0.00953, + "grad_norm": 0.501582875002041, + "learning_rate": 0.002859, + "loss": 4.4606, + "step": 953 + }, + { + "epoch": 0.00954, + "grad_norm": 0.5014106294436917, + "learning_rate": 0.002862, + "loss": 4.5197, + "step": 954 + }, + { + "epoch": 0.00955, + "grad_norm": 0.6047606934565909, + "learning_rate": 0.002865, + "loss": 4.5086, + "step": 955 + }, + { + "epoch": 0.00956, + "grad_norm": 0.6661868633369662, + "learning_rate": 0.002868, + "loss": 4.4921, + "step": 956 + }, + { + "epoch": 0.00957, + "grad_norm": 0.6511713371124522, + "learning_rate": 0.002871, + "loss": 4.514, + "step": 957 + }, + { + "epoch": 0.00958, + "grad_norm": 0.5733443203887492, + "learning_rate": 0.002874, + "loss": 4.4931, + "step": 958 + }, + { + "epoch": 0.00959, + "grad_norm": 0.6024952806359369, + "learning_rate": 0.002877, + "loss": 4.4895, + "step": 959 + }, + { + "epoch": 0.0096, + "grad_norm": 0.6029559818977924, + "learning_rate": 0.0028799999999999997, + "loss": 4.4868, + "step": 960 + }, + { + "epoch": 0.00961, + "grad_norm": 0.5721073369283843, + "learning_rate": 0.002883, + "loss": 4.4604, + "step": 961 + }, + { + "epoch": 0.00962, + "grad_norm": 0.5737900491823522, + "learning_rate": 0.002886, + "loss": 4.4898, + "step": 962 + }, + { + "epoch": 0.00963, + "grad_norm": 0.5323481251626608, + "learning_rate": 0.002889, + "loss": 4.4867, + "step": 963 + }, + { + "epoch": 0.00964, + "grad_norm": 0.5436801325002781, + "learning_rate": 0.002892, + "loss": 4.4807, + "step": 964 + }, + { + "epoch": 0.00965, + "grad_norm": 0.60229729083351, + "learning_rate": 0.002895, + "loss": 4.4568, + "step": 965 + }, + { + "epoch": 0.00966, + "grad_norm": 0.6629818387101306, + "learning_rate": 0.002898, + "loss": 4.4766, + "step": 966 + }, + { + "epoch": 0.00967, + "grad_norm": 0.6748155978904155, + "learning_rate": 0.002901, + "loss": 4.5156, + "step": 967 + }, + { + "epoch": 0.00968, + "grad_norm": 0.7427494012599226, + "learning_rate": 0.002904, + "loss": 4.4866, + "step": 968 + }, + { + "epoch": 0.00969, + "grad_norm": 0.8794961931178971, + "learning_rate": 0.002907, + "loss": 4.5273, + "step": 969 + }, + { + "epoch": 0.0097, + "grad_norm": 0.8586008854691127, + "learning_rate": 0.00291, + "loss": 4.4896, + "step": 970 + }, + { + "epoch": 0.00971, + "grad_norm": 0.8273563438543869, + "learning_rate": 0.002913, + "loss": 4.4955, + "step": 971 + }, + { + "epoch": 0.00972, + "grad_norm": 0.7536097688784559, + "learning_rate": 0.002916, + "loss": 4.5029, + "step": 972 + }, + { + "epoch": 0.00973, + "grad_norm": 0.7541845251322323, + "learning_rate": 0.002919, + "loss": 4.4985, + "step": 973 + }, + { + "epoch": 0.00974, + "grad_norm": 0.6473607436694337, + "learning_rate": 0.002922, + "loss": 4.4796, + "step": 974 + }, + { + "epoch": 0.00975, + "grad_norm": 0.7361402706574074, + "learning_rate": 0.002925, + "loss": 4.5044, + "step": 975 + }, + { + "epoch": 0.00976, + "grad_norm": 0.868936228763895, + "learning_rate": 0.002928, + "loss": 4.508, + "step": 976 + }, + { + "epoch": 0.00977, + "grad_norm": 0.8813019375073942, + "learning_rate": 0.002931, + "loss": 4.5408, + "step": 977 + }, + { + "epoch": 0.00978, + "grad_norm": 0.9426880168937273, + "learning_rate": 0.002934, + "loss": 4.5239, + "step": 978 + }, + { + "epoch": 0.00979, + "grad_norm": 0.8776049562434768, + "learning_rate": 0.002937, + "loss": 4.5177, + "step": 979 + }, + { + "epoch": 0.0098, + "grad_norm": 0.7621714209410982, + "learning_rate": 0.00294, + "loss": 4.5005, + "step": 980 + }, + { + "epoch": 0.00981, + "grad_norm": 0.7607321859563556, + "learning_rate": 0.002943, + "loss": 4.5183, + "step": 981 + }, + { + "epoch": 0.00982, + "grad_norm": 0.8148690145722087, + "learning_rate": 0.002946, + "loss": 4.5101, + "step": 982 + }, + { + "epoch": 0.00983, + "grad_norm": 0.8602879224908239, + "learning_rate": 0.0029490000000000002, + "loss": 4.5186, + "step": 983 + }, + { + "epoch": 0.00984, + "grad_norm": 0.9348586711938943, + "learning_rate": 0.002952, + "loss": 4.5105, + "step": 984 + }, + { + "epoch": 0.00985, + "grad_norm": 0.9010607870226212, + "learning_rate": 0.002955, + "loss": 4.5341, + "step": 985 + }, + { + "epoch": 0.00986, + "grad_norm": 0.8225410280635316, + "learning_rate": 0.002958, + "loss": 4.497, + "step": 986 + }, + { + "epoch": 0.00987, + "grad_norm": 0.820560994458863, + "learning_rate": 0.002961, + "loss": 4.5111, + "step": 987 + }, + { + "epoch": 0.00988, + "grad_norm": 0.7430257271274537, + "learning_rate": 0.002964, + "loss": 4.5437, + "step": 988 + }, + { + "epoch": 0.00989, + "grad_norm": 0.7193873820034543, + "learning_rate": 0.002967, + "loss": 4.4838, + "step": 989 + }, + { + "epoch": 0.0099, + "grad_norm": 0.7329220852773792, + "learning_rate": 0.00297, + "loss": 4.5219, + "step": 990 + }, + { + "epoch": 0.00991, + "grad_norm": 0.7694030006138932, + "learning_rate": 0.002973, + "loss": 4.5213, + "step": 991 + }, + { + "epoch": 0.00992, + "grad_norm": 0.7726534251991994, + "learning_rate": 0.002976, + "loss": 4.5153, + "step": 992 + }, + { + "epoch": 0.00993, + "grad_norm": 0.6817474065224322, + "learning_rate": 0.002979, + "loss": 4.511, + "step": 993 + }, + { + "epoch": 0.00994, + "grad_norm": 0.6628189744120299, + "learning_rate": 0.002982, + "loss": 4.5078, + "step": 994 + }, + { + "epoch": 0.00995, + "grad_norm": 0.6249124245549155, + "learning_rate": 0.0029850000000000002, + "loss": 4.5069, + "step": 995 + }, + { + "epoch": 0.00996, + "grad_norm": 0.54278083452404, + "learning_rate": 0.002988, + "loss": 4.503, + "step": 996 + }, + { + "epoch": 0.00997, + "grad_norm": 0.5131542547273349, + "learning_rate": 0.002991, + "loss": 4.49, + "step": 997 + }, + { + "epoch": 0.00998, + "grad_norm": 0.4760166868407609, + "learning_rate": 0.002994, + "loss": 4.4895, + "step": 998 + }, + { + "epoch": 0.00999, + "grad_norm": 0.4855483581267517, + "learning_rate": 0.002997, + "loss": 4.4707, + "step": 999 + }, + { + "epoch": 0.01, + "grad_norm": 0.44929338024832627, + "learning_rate": 0.003, + "loss": 4.4897, + "step": 1000 + }, + { + "epoch": 0.01001, + "grad_norm": 0.3731978010457433, + "learning_rate": 0.003, + "loss": 4.4851, + "step": 1001 + }, + { + "epoch": 0.01002, + "grad_norm": 0.37113077487631946, + "learning_rate": 0.003, + "loss": 4.4516, + "step": 1002 + }, + { + "epoch": 0.01003, + "grad_norm": 0.35590481029592896, + "learning_rate": 0.003, + "loss": 4.4696, + "step": 1003 + }, + { + "epoch": 0.01004, + "grad_norm": 0.36275775448168, + "learning_rate": 0.003, + "loss": 4.4605, + "step": 1004 + }, + { + "epoch": 0.01005, + "grad_norm": 0.43407423243746507, + "learning_rate": 0.003, + "loss": 4.4329, + "step": 1005 + }, + { + "epoch": 0.01006, + "grad_norm": 0.5622897072513495, + "learning_rate": 0.003, + "loss": 4.4378, + "step": 1006 + }, + { + "epoch": 0.01007, + "grad_norm": 0.742851457120247, + "learning_rate": 0.003, + "loss": 4.4915, + "step": 1007 + }, + { + "epoch": 0.01008, + "grad_norm": 0.6898510838085624, + "learning_rate": 0.003, + "loss": 4.5119, + "step": 1008 + }, + { + "epoch": 0.01009, + "grad_norm": 0.5850313111790646, + "learning_rate": 0.003, + "loss": 4.4686, + "step": 1009 + }, + { + "epoch": 0.0101, + "grad_norm": 0.6689327239491644, + "learning_rate": 0.003, + "loss": 4.4833, + "step": 1010 + }, + { + "epoch": 0.01011, + "grad_norm": 0.6177431531224691, + "learning_rate": 0.003, + "loss": 4.4907, + "step": 1011 + }, + { + "epoch": 0.01012, + "grad_norm": 0.529876877427535, + "learning_rate": 0.003, + "loss": 4.4694, + "step": 1012 + }, + { + "epoch": 0.01013, + "grad_norm": 0.5027890853405218, + "learning_rate": 0.003, + "loss": 4.4915, + "step": 1013 + }, + { + "epoch": 0.01014, + "grad_norm": 0.4606528826158309, + "learning_rate": 0.003, + "loss": 4.4311, + "step": 1014 + }, + { + "epoch": 0.01015, + "grad_norm": 0.43413097561878167, + "learning_rate": 0.003, + "loss": 4.4729, + "step": 1015 + }, + { + "epoch": 0.01016, + "grad_norm": 0.45162191517129524, + "learning_rate": 0.003, + "loss": 4.4813, + "step": 1016 + }, + { + "epoch": 0.01017, + "grad_norm": 0.4151722379204413, + "learning_rate": 0.003, + "loss": 4.4194, + "step": 1017 + }, + { + "epoch": 0.01018, + "grad_norm": 0.42684710143489196, + "learning_rate": 0.003, + "loss": 4.4738, + "step": 1018 + }, + { + "epoch": 0.01019, + "grad_norm": 0.49455469810835373, + "learning_rate": 0.003, + "loss": 4.4546, + "step": 1019 + }, + { + "epoch": 0.0102, + "grad_norm": 0.44331421163833096, + "learning_rate": 0.003, + "loss": 4.4504, + "step": 1020 + }, + { + "epoch": 0.01021, + "grad_norm": 0.44293820189086996, + "learning_rate": 0.003, + "loss": 4.4351, + "step": 1021 + }, + { + "epoch": 0.01022, + "grad_norm": 0.471018484914238, + "learning_rate": 0.003, + "loss": 4.4454, + "step": 1022 + }, + { + "epoch": 0.01023, + "grad_norm": 0.5245843463151362, + "learning_rate": 0.003, + "loss": 4.4439, + "step": 1023 + }, + { + "epoch": 0.01024, + "grad_norm": 0.5471345601505091, + "learning_rate": 0.003, + "loss": 4.4059, + "step": 1024 + }, + { + "epoch": 0.01025, + "grad_norm": 0.5561682271484449, + "learning_rate": 0.003, + "loss": 4.4357, + "step": 1025 + }, + { + "epoch": 0.01026, + "grad_norm": 0.6750941633687737, + "learning_rate": 0.003, + "loss": 4.4343, + "step": 1026 + }, + { + "epoch": 0.01027, + "grad_norm": 0.6614940222952833, + "learning_rate": 0.003, + "loss": 4.4593, + "step": 1027 + }, + { + "epoch": 0.01028, + "grad_norm": 0.8232761879596748, + "learning_rate": 0.003, + "loss": 4.4444, + "step": 1028 + }, + { + "epoch": 0.01029, + "grad_norm": 0.926962960927152, + "learning_rate": 0.003, + "loss": 4.4584, + "step": 1029 + }, + { + "epoch": 0.0103, + "grad_norm": 0.7880165390972924, + "learning_rate": 0.003, + "loss": 4.4737, + "step": 1030 + }, + { + "epoch": 0.01031, + "grad_norm": 0.8244417100638626, + "learning_rate": 0.003, + "loss": 4.493, + "step": 1031 + }, + { + "epoch": 0.01032, + "grad_norm": 0.8457876785075499, + "learning_rate": 0.003, + "loss": 4.4747, + "step": 1032 + }, + { + "epoch": 0.01033, + "grad_norm": 0.7537115798125514, + "learning_rate": 0.003, + "loss": 4.5075, + "step": 1033 + }, + { + "epoch": 0.01034, + "grad_norm": 0.7955188357741166, + "learning_rate": 0.003, + "loss": 4.4759, + "step": 1034 + }, + { + "epoch": 0.01035, + "grad_norm": 0.807776556733491, + "learning_rate": 0.003, + "loss": 4.4793, + "step": 1035 + }, + { + "epoch": 0.01036, + "grad_norm": 0.8192211538243458, + "learning_rate": 0.003, + "loss": 4.4746, + "step": 1036 + }, + { + "epoch": 0.01037, + "grad_norm": 0.8125711495155172, + "learning_rate": 0.003, + "loss": 4.5016, + "step": 1037 + }, + { + "epoch": 0.01038, + "grad_norm": 0.8698834270185546, + "learning_rate": 0.003, + "loss": 4.5079, + "step": 1038 + }, + { + "epoch": 0.01039, + "grad_norm": 0.8070907960047083, + "learning_rate": 0.003, + "loss": 4.4962, + "step": 1039 + }, + { + "epoch": 0.0104, + "grad_norm": 0.8379908956267174, + "learning_rate": 0.003, + "loss": 4.5113, + "step": 1040 + }, + { + "epoch": 0.01041, + "grad_norm": 0.9467065153215795, + "learning_rate": 0.003, + "loss": 4.4994, + "step": 1041 + }, + { + "epoch": 0.01042, + "grad_norm": 1.0844879341171343, + "learning_rate": 0.003, + "loss": 4.5455, + "step": 1042 + }, + { + "epoch": 0.01043, + "grad_norm": 0.8223563094033745, + "learning_rate": 0.003, + "loss": 4.4732, + "step": 1043 + }, + { + "epoch": 0.01044, + "grad_norm": 0.7034538108566051, + "learning_rate": 0.003, + "loss": 4.5118, + "step": 1044 + }, + { + "epoch": 0.01045, + "grad_norm": 0.7046831935605008, + "learning_rate": 0.003, + "loss": 4.5282, + "step": 1045 + }, + { + "epoch": 0.01046, + "grad_norm": 0.5291717642722772, + "learning_rate": 0.003, + "loss": 4.4864, + "step": 1046 + }, + { + "epoch": 0.01047, + "grad_norm": 0.517981866453337, + "learning_rate": 0.003, + "loss": 4.4886, + "step": 1047 + }, + { + "epoch": 0.01048, + "grad_norm": 0.5100897334283181, + "learning_rate": 0.003, + "loss": 4.4775, + "step": 1048 + }, + { + "epoch": 0.01049, + "grad_norm": 0.4646755867304285, + "learning_rate": 0.003, + "loss": 4.4516, + "step": 1049 + }, + { + "epoch": 0.0105, + "grad_norm": 0.4688262914765259, + "learning_rate": 0.003, + "loss": 4.491, + "step": 1050 + }, + { + "epoch": 0.01051, + "grad_norm": 0.40479473066275506, + "learning_rate": 0.003, + "loss": 4.4613, + "step": 1051 + }, + { + "epoch": 0.01052, + "grad_norm": 0.37634413468362676, + "learning_rate": 0.003, + "loss": 4.4718, + "step": 1052 + }, + { + "epoch": 0.01053, + "grad_norm": 0.35380747787174, + "learning_rate": 0.003, + "loss": 4.4768, + "step": 1053 + }, + { + "epoch": 0.01054, + "grad_norm": 0.3233101912432746, + "learning_rate": 0.003, + "loss": 4.4657, + "step": 1054 + }, + { + "epoch": 0.01055, + "grad_norm": 0.3051647539319004, + "learning_rate": 0.003, + "loss": 4.4412, + "step": 1055 + }, + { + "epoch": 0.01056, + "grad_norm": 0.3183252525248436, + "learning_rate": 0.003, + "loss": 4.4395, + "step": 1056 + }, + { + "epoch": 0.01057, + "grad_norm": 0.32244230779772487, + "learning_rate": 0.003, + "loss": 4.443, + "step": 1057 + }, + { + "epoch": 0.01058, + "grad_norm": 0.37190643160876063, + "learning_rate": 0.003, + "loss": 4.4376, + "step": 1058 + }, + { + "epoch": 0.01059, + "grad_norm": 0.4982331351188646, + "learning_rate": 0.003, + "loss": 4.4406, + "step": 1059 + }, + { + "epoch": 0.0106, + "grad_norm": 0.630051966003807, + "learning_rate": 0.003, + "loss": 4.4377, + "step": 1060 + }, + { + "epoch": 0.01061, + "grad_norm": 0.6685664249418685, + "learning_rate": 0.003, + "loss": 4.4357, + "step": 1061 + }, + { + "epoch": 0.01062, + "grad_norm": 0.5702779525542006, + "learning_rate": 0.003, + "loss": 4.4281, + "step": 1062 + }, + { + "epoch": 0.01063, + "grad_norm": 0.47516871800908467, + "learning_rate": 0.003, + "loss": 4.4193, + "step": 1063 + }, + { + "epoch": 0.01064, + "grad_norm": 0.5362466950028499, + "learning_rate": 0.003, + "loss": 4.4381, + "step": 1064 + }, + { + "epoch": 0.01065, + "grad_norm": 0.5373139287833821, + "learning_rate": 0.003, + "loss": 4.4375, + "step": 1065 + }, + { + "epoch": 0.01066, + "grad_norm": 0.5486100821685685, + "learning_rate": 0.003, + "loss": 4.44, + "step": 1066 + }, + { + "epoch": 0.01067, + "grad_norm": 0.4816436794229303, + "learning_rate": 0.003, + "loss": 4.4454, + "step": 1067 + }, + { + "epoch": 0.01068, + "grad_norm": 0.429069056402875, + "learning_rate": 0.003, + "loss": 4.4286, + "step": 1068 + }, + { + "epoch": 0.01069, + "grad_norm": 0.4352935464081401, + "learning_rate": 0.003, + "loss": 4.4347, + "step": 1069 + }, + { + "epoch": 0.0107, + "grad_norm": 0.4882626803662278, + "learning_rate": 0.003, + "loss": 4.4874, + "step": 1070 + }, + { + "epoch": 0.01071, + "grad_norm": 0.599919651347176, + "learning_rate": 0.003, + "loss": 4.4109, + "step": 1071 + }, + { + "epoch": 0.01072, + "grad_norm": 0.6067483783044649, + "learning_rate": 0.003, + "loss": 4.4536, + "step": 1072 + }, + { + "epoch": 0.01073, + "grad_norm": 0.49239058441195105, + "learning_rate": 0.003, + "loss": 4.4412, + "step": 1073 + }, + { + "epoch": 0.01074, + "grad_norm": 0.49249754408010815, + "learning_rate": 0.003, + "loss": 4.4068, + "step": 1074 + }, + { + "epoch": 0.01075, + "grad_norm": 0.4103440289244663, + "learning_rate": 0.003, + "loss": 4.4418, + "step": 1075 + }, + { + "epoch": 0.01076, + "grad_norm": 0.41435198540181395, + "learning_rate": 0.003, + "loss": 4.4375, + "step": 1076 + }, + { + "epoch": 0.01077, + "grad_norm": 0.42961621912720355, + "learning_rate": 0.003, + "loss": 4.4322, + "step": 1077 + }, + { + "epoch": 0.01078, + "grad_norm": 0.47858570596245864, + "learning_rate": 0.003, + "loss": 4.3877, + "step": 1078 + }, + { + "epoch": 0.01079, + "grad_norm": 0.5533382064380803, + "learning_rate": 0.003, + "loss": 4.4233, + "step": 1079 + }, + { + "epoch": 0.0108, + "grad_norm": 0.6417480625144324, + "learning_rate": 0.003, + "loss": 4.4054, + "step": 1080 + }, + { + "epoch": 0.01081, + "grad_norm": 0.7431146939801978, + "learning_rate": 0.003, + "loss": 4.4226, + "step": 1081 + }, + { + "epoch": 0.01082, + "grad_norm": 0.9014359739424833, + "learning_rate": 0.003, + "loss": 4.4534, + "step": 1082 + }, + { + "epoch": 0.01083, + "grad_norm": 0.8675952653065004, + "learning_rate": 0.003, + "loss": 4.4518, + "step": 1083 + }, + { + "epoch": 0.01084, + "grad_norm": 0.7255782707521201, + "learning_rate": 0.003, + "loss": 4.448, + "step": 1084 + }, + { + "epoch": 0.01085, + "grad_norm": 0.6450755897364046, + "learning_rate": 0.003, + "loss": 4.4349, + "step": 1085 + }, + { + "epoch": 0.01086, + "grad_norm": 0.6814430736440221, + "learning_rate": 0.003, + "loss": 4.4125, + "step": 1086 + }, + { + "epoch": 0.01087, + "grad_norm": 0.6399587640180319, + "learning_rate": 0.003, + "loss": 4.4286, + "step": 1087 + }, + { + "epoch": 0.01088, + "grad_norm": 0.5625590086017324, + "learning_rate": 0.003, + "loss": 4.4612, + "step": 1088 + }, + { + "epoch": 0.01089, + "grad_norm": 0.5476518731942144, + "learning_rate": 0.003, + "loss": 4.4282, + "step": 1089 + }, + { + "epoch": 0.0109, + "grad_norm": 0.6962765199988842, + "learning_rate": 0.003, + "loss": 4.4603, + "step": 1090 + }, + { + "epoch": 0.01091, + "grad_norm": 0.7992579884725848, + "learning_rate": 0.003, + "loss": 4.4391, + "step": 1091 + }, + { + "epoch": 0.01092, + "grad_norm": 0.6922142003261247, + "learning_rate": 0.003, + "loss": 4.43, + "step": 1092 + }, + { + "epoch": 0.01093, + "grad_norm": 0.7201776431280809, + "learning_rate": 0.003, + "loss": 4.4545, + "step": 1093 + }, + { + "epoch": 0.01094, + "grad_norm": 0.7324411775727792, + "learning_rate": 0.003, + "loss": 4.4431, + "step": 1094 + }, + { + "epoch": 0.01095, + "grad_norm": 0.7356072077097602, + "learning_rate": 0.003, + "loss": 4.4704, + "step": 1095 + }, + { + "epoch": 0.01096, + "grad_norm": 0.7577305220019848, + "learning_rate": 0.003, + "loss": 4.445, + "step": 1096 + }, + { + "epoch": 0.01097, + "grad_norm": 0.7290952400289119, + "learning_rate": 0.003, + "loss": 4.4352, + "step": 1097 + }, + { + "epoch": 0.01098, + "grad_norm": 0.8085780813837044, + "learning_rate": 0.003, + "loss": 4.4555, + "step": 1098 + }, + { + "epoch": 0.01099, + "grad_norm": 0.8944752292785944, + "learning_rate": 0.003, + "loss": 4.4361, + "step": 1099 + }, + { + "epoch": 0.011, + "grad_norm": 0.7875559243106586, + "learning_rate": 0.003, + "loss": 4.4348, + "step": 1100 + }, + { + "epoch": 0.01101, + "grad_norm": 0.8589536043002628, + "learning_rate": 0.003, + "loss": 4.4539, + "step": 1101 + }, + { + "epoch": 0.01102, + "grad_norm": 0.8464001238353072, + "learning_rate": 0.003, + "loss": 4.4762, + "step": 1102 + }, + { + "epoch": 0.01103, + "grad_norm": 0.8851736378981229, + "learning_rate": 0.003, + "loss": 4.4662, + "step": 1103 + }, + { + "epoch": 0.01104, + "grad_norm": 0.7153611537759403, + "learning_rate": 0.003, + "loss": 4.4545, + "step": 1104 + }, + { + "epoch": 0.01105, + "grad_norm": 0.5180783683134489, + "learning_rate": 0.003, + "loss": 4.4659, + "step": 1105 + }, + { + "epoch": 0.01106, + "grad_norm": 0.5726422216533474, + "learning_rate": 0.003, + "loss": 4.4387, + "step": 1106 + }, + { + "epoch": 0.01107, + "grad_norm": 0.5702919336798088, + "learning_rate": 0.003, + "loss": 4.4468, + "step": 1107 + }, + { + "epoch": 0.01108, + "grad_norm": 0.48082946780834995, + "learning_rate": 0.003, + "loss": 4.4114, + "step": 1108 + }, + { + "epoch": 0.01109, + "grad_norm": 0.48270985447367, + "learning_rate": 0.003, + "loss": 4.4251, + "step": 1109 + }, + { + "epoch": 0.0111, + "grad_norm": 0.521002318978107, + "learning_rate": 0.003, + "loss": 4.4465, + "step": 1110 + }, + { + "epoch": 0.01111, + "grad_norm": 0.5366151298263073, + "learning_rate": 0.003, + "loss": 4.4215, + "step": 1111 + }, + { + "epoch": 0.01112, + "grad_norm": 0.4687552317220928, + "learning_rate": 0.003, + "loss": 4.4366, + "step": 1112 + }, + { + "epoch": 0.01113, + "grad_norm": 0.41403168260550727, + "learning_rate": 0.003, + "loss": 4.403, + "step": 1113 + }, + { + "epoch": 0.01114, + "grad_norm": 0.4499817067649614, + "learning_rate": 0.003, + "loss": 4.4155, + "step": 1114 + }, + { + "epoch": 0.01115, + "grad_norm": 0.5079887140928414, + "learning_rate": 0.003, + "loss": 4.4407, + "step": 1115 + }, + { + "epoch": 0.01116, + "grad_norm": 0.4835939068607683, + "learning_rate": 0.003, + "loss": 4.4267, + "step": 1116 + }, + { + "epoch": 0.01117, + "grad_norm": 0.4596276551432057, + "learning_rate": 0.003, + "loss": 4.4124, + "step": 1117 + }, + { + "epoch": 0.01118, + "grad_norm": 0.4299051580324855, + "learning_rate": 0.003, + "loss": 4.414, + "step": 1118 + }, + { + "epoch": 0.01119, + "grad_norm": 0.43855200803943045, + "learning_rate": 0.003, + "loss": 4.4183, + "step": 1119 + }, + { + "epoch": 0.0112, + "grad_norm": 0.47045620681482975, + "learning_rate": 0.003, + "loss": 4.4017, + "step": 1120 + }, + { + "epoch": 0.01121, + "grad_norm": 0.47984890763909716, + "learning_rate": 0.003, + "loss": 4.4142, + "step": 1121 + }, + { + "epoch": 0.01122, + "grad_norm": 0.4953392048385066, + "learning_rate": 0.003, + "loss": 4.3919, + "step": 1122 + }, + { + "epoch": 0.01123, + "grad_norm": 0.5318471574599554, + "learning_rate": 0.003, + "loss": 4.3799, + "step": 1123 + }, + { + "epoch": 0.01124, + "grad_norm": 0.5664861125554082, + "learning_rate": 0.003, + "loss": 4.4277, + "step": 1124 + }, + { + "epoch": 0.01125, + "grad_norm": 0.5893995505624438, + "learning_rate": 0.003, + "loss": 4.3995, + "step": 1125 + }, + { + "epoch": 0.01126, + "grad_norm": 0.5624864569583264, + "learning_rate": 0.003, + "loss": 4.4071, + "step": 1126 + }, + { + "epoch": 0.01127, + "grad_norm": 0.6840590538183918, + "learning_rate": 0.003, + "loss": 4.442, + "step": 1127 + }, + { + "epoch": 0.01128, + "grad_norm": 0.6290952877293927, + "learning_rate": 0.003, + "loss": 4.4221, + "step": 1128 + }, + { + "epoch": 0.01129, + "grad_norm": 0.5439309858740465, + "learning_rate": 0.003, + "loss": 4.4089, + "step": 1129 + }, + { + "epoch": 0.0113, + "grad_norm": 0.5640044781984855, + "learning_rate": 0.003, + "loss": 4.3915, + "step": 1130 + }, + { + "epoch": 0.01131, + "grad_norm": 0.5291031908216413, + "learning_rate": 0.003, + "loss": 4.4024, + "step": 1131 + }, + { + "epoch": 0.01132, + "grad_norm": 0.5330070088798269, + "learning_rate": 0.003, + "loss": 4.4286, + "step": 1132 + }, + { + "epoch": 0.01133, + "grad_norm": 0.49343294212766914, + "learning_rate": 0.003, + "loss": 4.387, + "step": 1133 + }, + { + "epoch": 0.01134, + "grad_norm": 0.5456485707169718, + "learning_rate": 0.003, + "loss": 4.4014, + "step": 1134 + }, + { + "epoch": 0.01135, + "grad_norm": 0.5446260165807105, + "learning_rate": 0.003, + "loss": 4.4306, + "step": 1135 + }, + { + "epoch": 0.01136, + "grad_norm": 0.4995675256097383, + "learning_rate": 0.003, + "loss": 4.4015, + "step": 1136 + }, + { + "epoch": 0.01137, + "grad_norm": 0.5676213324650841, + "learning_rate": 0.003, + "loss": 4.4302, + "step": 1137 + }, + { + "epoch": 0.01138, + "grad_norm": 0.5808574548770593, + "learning_rate": 0.003, + "loss": 4.3883, + "step": 1138 + }, + { + "epoch": 0.01139, + "grad_norm": 0.5783839384535551, + "learning_rate": 0.003, + "loss": 4.3964, + "step": 1139 + }, + { + "epoch": 0.0114, + "grad_norm": 0.5416297129288978, + "learning_rate": 0.003, + "loss": 4.3794, + "step": 1140 + }, + { + "epoch": 0.01141, + "grad_norm": 0.5159183474876468, + "learning_rate": 0.003, + "loss": 4.4038, + "step": 1141 + }, + { + "epoch": 0.01142, + "grad_norm": 0.5569718794922074, + "learning_rate": 0.003, + "loss": 4.3824, + "step": 1142 + }, + { + "epoch": 0.01143, + "grad_norm": 0.5472188962598419, + "learning_rate": 0.003, + "loss": 4.3984, + "step": 1143 + }, + { + "epoch": 0.01144, + "grad_norm": 0.5020345156310454, + "learning_rate": 0.003, + "loss": 4.3625, + "step": 1144 + }, + { + "epoch": 0.01145, + "grad_norm": 0.5548649777515073, + "learning_rate": 0.003, + "loss": 4.4027, + "step": 1145 + }, + { + "epoch": 0.01146, + "grad_norm": 0.5316552996009986, + "learning_rate": 0.003, + "loss": 4.3864, + "step": 1146 + }, + { + "epoch": 0.01147, + "grad_norm": 0.580862846664092, + "learning_rate": 0.003, + "loss": 4.3809, + "step": 1147 + }, + { + "epoch": 0.01148, + "grad_norm": 0.6205202997095316, + "learning_rate": 0.003, + "loss": 4.3827, + "step": 1148 + }, + { + "epoch": 0.01149, + "grad_norm": 0.6731514287407836, + "learning_rate": 0.003, + "loss": 4.4075, + "step": 1149 + }, + { + "epoch": 0.0115, + "grad_norm": 0.8717295567714907, + "learning_rate": 0.003, + "loss": 4.4098, + "step": 1150 + }, + { + "epoch": 0.01151, + "grad_norm": 0.8032465834892395, + "learning_rate": 0.003, + "loss": 4.4148, + "step": 1151 + }, + { + "epoch": 0.01152, + "grad_norm": 0.7381597286505803, + "learning_rate": 0.003, + "loss": 4.4217, + "step": 1152 + }, + { + "epoch": 0.01153, + "grad_norm": 0.7429010521184413, + "learning_rate": 0.003, + "loss": 4.4395, + "step": 1153 + }, + { + "epoch": 0.01154, + "grad_norm": 0.6900014831219232, + "learning_rate": 0.003, + "loss": 4.4154, + "step": 1154 + }, + { + "epoch": 0.01155, + "grad_norm": 0.64430413026539, + "learning_rate": 0.003, + "loss": 4.4019, + "step": 1155 + }, + { + "epoch": 0.01156, + "grad_norm": 0.5602174601436266, + "learning_rate": 0.003, + "loss": 4.3999, + "step": 1156 + }, + { + "epoch": 0.01157, + "grad_norm": 0.5870036133781706, + "learning_rate": 0.003, + "loss": 4.4301, + "step": 1157 + }, + { + "epoch": 0.01158, + "grad_norm": 0.49506132828209426, + "learning_rate": 0.003, + "loss": 4.3945, + "step": 1158 + }, + { + "epoch": 0.01159, + "grad_norm": 0.4315860669439085, + "learning_rate": 0.003, + "loss": 4.3852, + "step": 1159 + }, + { + "epoch": 0.0116, + "grad_norm": 0.48532942177035343, + "learning_rate": 0.003, + "loss": 4.4026, + "step": 1160 + }, + { + "epoch": 0.01161, + "grad_norm": 0.441658698740049, + "learning_rate": 0.003, + "loss": 4.3739, + "step": 1161 + }, + { + "epoch": 0.01162, + "grad_norm": 0.4784702532673203, + "learning_rate": 0.003, + "loss": 4.3867, + "step": 1162 + }, + { + "epoch": 0.01163, + "grad_norm": 0.44588993451057907, + "learning_rate": 0.003, + "loss": 4.4046, + "step": 1163 + }, + { + "epoch": 0.01164, + "grad_norm": 0.3982553498496815, + "learning_rate": 0.003, + "loss": 4.3917, + "step": 1164 + }, + { + "epoch": 0.01165, + "grad_norm": 0.3715052589980553, + "learning_rate": 0.003, + "loss": 4.3843, + "step": 1165 + }, + { + "epoch": 0.01166, + "grad_norm": 0.35254776690075756, + "learning_rate": 0.003, + "loss": 4.3817, + "step": 1166 + }, + { + "epoch": 0.01167, + "grad_norm": 0.41049866169548815, + "learning_rate": 0.003, + "loss": 4.3499, + "step": 1167 + }, + { + "epoch": 0.01168, + "grad_norm": 0.43814969571755685, + "learning_rate": 0.003, + "loss": 4.3577, + "step": 1168 + }, + { + "epoch": 0.01169, + "grad_norm": 0.5199727636769494, + "learning_rate": 0.003, + "loss": 4.401, + "step": 1169 + }, + { + "epoch": 0.0117, + "grad_norm": 0.7308392317868168, + "learning_rate": 0.003, + "loss": 4.3971, + "step": 1170 + }, + { + "epoch": 0.01171, + "grad_norm": 0.8534230998766525, + "learning_rate": 0.003, + "loss": 4.3895, + "step": 1171 + }, + { + "epoch": 0.01172, + "grad_norm": 0.7264855860795836, + "learning_rate": 0.003, + "loss": 4.3801, + "step": 1172 + }, + { + "epoch": 0.01173, + "grad_norm": 0.7253809298036523, + "learning_rate": 0.003, + "loss": 4.4199, + "step": 1173 + }, + { + "epoch": 0.01174, + "grad_norm": 0.7347248301973536, + "learning_rate": 0.003, + "loss": 4.3956, + "step": 1174 + }, + { + "epoch": 0.01175, + "grad_norm": 0.837594156223567, + "learning_rate": 0.003, + "loss": 4.3885, + "step": 1175 + }, + { + "epoch": 0.01176, + "grad_norm": 0.6959008759390608, + "learning_rate": 0.003, + "loss": 4.4162, + "step": 1176 + }, + { + "epoch": 0.01177, + "grad_norm": 0.7136269362957118, + "learning_rate": 0.003, + "loss": 4.3914, + "step": 1177 + }, + { + "epoch": 0.01178, + "grad_norm": 0.5923303437356635, + "learning_rate": 0.003, + "loss": 4.4002, + "step": 1178 + }, + { + "epoch": 0.01179, + "grad_norm": 0.5305527199591268, + "learning_rate": 0.003, + "loss": 4.3783, + "step": 1179 + }, + { + "epoch": 0.0118, + "grad_norm": 0.5526769016584179, + "learning_rate": 0.003, + "loss": 4.4415, + "step": 1180 + }, + { + "epoch": 0.01181, + "grad_norm": 0.603168145699105, + "learning_rate": 0.003, + "loss": 4.4326, + "step": 1181 + }, + { + "epoch": 0.01182, + "grad_norm": 0.5234478564139211, + "learning_rate": 0.003, + "loss": 4.3879, + "step": 1182 + }, + { + "epoch": 0.01183, + "grad_norm": 0.571647059903747, + "learning_rate": 0.003, + "loss": 4.3807, + "step": 1183 + }, + { + "epoch": 0.01184, + "grad_norm": 0.6778225586382575, + "learning_rate": 0.003, + "loss": 4.4261, + "step": 1184 + }, + { + "epoch": 0.01185, + "grad_norm": 0.7498329664059358, + "learning_rate": 0.003, + "loss": 4.4017, + "step": 1185 + }, + { + "epoch": 0.01186, + "grad_norm": 0.7043825932339357, + "learning_rate": 0.003, + "loss": 4.4247, + "step": 1186 + }, + { + "epoch": 0.01187, + "grad_norm": 0.6471541142959871, + "learning_rate": 0.003, + "loss": 4.3867, + "step": 1187 + }, + { + "epoch": 0.01188, + "grad_norm": 0.498120631856624, + "learning_rate": 0.003, + "loss": 4.3947, + "step": 1188 + }, + { + "epoch": 0.01189, + "grad_norm": 0.4776874390876697, + "learning_rate": 0.003, + "loss": 4.3881, + "step": 1189 + }, + { + "epoch": 0.0119, + "grad_norm": 0.5052480096056097, + "learning_rate": 0.003, + "loss": 4.3727, + "step": 1190 + }, + { + "epoch": 0.01191, + "grad_norm": 0.554587298838607, + "learning_rate": 0.003, + "loss": 4.3534, + "step": 1191 + }, + { + "epoch": 0.01192, + "grad_norm": 0.652885110576955, + "learning_rate": 0.003, + "loss": 4.3938, + "step": 1192 + }, + { + "epoch": 0.01193, + "grad_norm": 0.814652753692073, + "learning_rate": 0.003, + "loss": 4.4033, + "step": 1193 + }, + { + "epoch": 0.01194, + "grad_norm": 0.8149173654121779, + "learning_rate": 0.003, + "loss": 4.4185, + "step": 1194 + }, + { + "epoch": 0.01195, + "grad_norm": 0.770137476772292, + "learning_rate": 0.003, + "loss": 4.3916, + "step": 1195 + }, + { + "epoch": 0.01196, + "grad_norm": 0.7050379864259693, + "learning_rate": 0.003, + "loss": 4.3964, + "step": 1196 + }, + { + "epoch": 0.01197, + "grad_norm": 0.5988825851814717, + "learning_rate": 0.003, + "loss": 4.3556, + "step": 1197 + }, + { + "epoch": 0.01198, + "grad_norm": 0.5626266858189237, + "learning_rate": 0.003, + "loss": 4.4296, + "step": 1198 + }, + { + "epoch": 0.01199, + "grad_norm": 0.5539141200560117, + "learning_rate": 0.003, + "loss": 4.4003, + "step": 1199 + }, + { + "epoch": 0.012, + "grad_norm": 0.446871069304691, + "learning_rate": 0.003, + "loss": 4.3872, + "step": 1200 + }, + { + "epoch": 0.01201, + "grad_norm": 0.41098308583368987, + "learning_rate": 0.003, + "loss": 4.4145, + "step": 1201 + }, + { + "epoch": 0.01202, + "grad_norm": 0.40482114944892034, + "learning_rate": 0.003, + "loss": 4.3801, + "step": 1202 + }, + { + "epoch": 0.01203, + "grad_norm": 0.3801203709283518, + "learning_rate": 0.003, + "loss": 4.3878, + "step": 1203 + }, + { + "epoch": 0.01204, + "grad_norm": 0.4044230744585578, + "learning_rate": 0.003, + "loss": 4.3855, + "step": 1204 + }, + { + "epoch": 0.01205, + "grad_norm": 0.4153028028477695, + "learning_rate": 0.003, + "loss": 4.3753, + "step": 1205 + }, + { + "epoch": 0.01206, + "grad_norm": 0.4253601027948775, + "learning_rate": 0.003, + "loss": 4.4192, + "step": 1206 + }, + { + "epoch": 0.01207, + "grad_norm": 0.42584001554399725, + "learning_rate": 0.003, + "loss": 4.3415, + "step": 1207 + }, + { + "epoch": 0.01208, + "grad_norm": 0.38034721696084095, + "learning_rate": 0.003, + "loss": 4.3983, + "step": 1208 + }, + { + "epoch": 0.01209, + "grad_norm": 0.38812168043845363, + "learning_rate": 0.003, + "loss": 4.3807, + "step": 1209 + }, + { + "epoch": 0.0121, + "grad_norm": 0.48056015790597395, + "learning_rate": 0.003, + "loss": 4.3838, + "step": 1210 + }, + { + "epoch": 0.01211, + "grad_norm": 0.5637402598516663, + "learning_rate": 0.003, + "loss": 4.3728, + "step": 1211 + }, + { + "epoch": 0.01212, + "grad_norm": 0.678908053443676, + "learning_rate": 0.003, + "loss": 4.4009, + "step": 1212 + }, + { + "epoch": 0.01213, + "grad_norm": 0.7872996276111477, + "learning_rate": 0.003, + "loss": 4.3904, + "step": 1213 + }, + { + "epoch": 0.01214, + "grad_norm": 0.737245569468942, + "learning_rate": 0.003, + "loss": 4.3712, + "step": 1214 + }, + { + "epoch": 0.01215, + "grad_norm": 0.7444446627290159, + "learning_rate": 0.003, + "loss": 4.3825, + "step": 1215 + }, + { + "epoch": 0.01216, + "grad_norm": 0.7493613956712142, + "learning_rate": 0.003, + "loss": 4.3829, + "step": 1216 + }, + { + "epoch": 0.01217, + "grad_norm": 0.7355810094680961, + "learning_rate": 0.003, + "loss": 4.3915, + "step": 1217 + }, + { + "epoch": 0.01218, + "grad_norm": 0.8012910697526976, + "learning_rate": 0.003, + "loss": 4.3871, + "step": 1218 + }, + { + "epoch": 0.01219, + "grad_norm": 0.8693012687676964, + "learning_rate": 0.003, + "loss": 4.3988, + "step": 1219 + }, + { + "epoch": 0.0122, + "grad_norm": 0.8958839342651639, + "learning_rate": 0.003, + "loss": 4.3996, + "step": 1220 + }, + { + "epoch": 0.01221, + "grad_norm": 1.0403569844700198, + "learning_rate": 0.003, + "loss": 4.4291, + "step": 1221 + }, + { + "epoch": 0.01222, + "grad_norm": 1.1355134224594017, + "learning_rate": 0.003, + "loss": 4.4293, + "step": 1222 + }, + { + "epoch": 0.01223, + "grad_norm": 0.9630650396281519, + "learning_rate": 0.003, + "loss": 4.4167, + "step": 1223 + }, + { + "epoch": 0.01224, + "grad_norm": 0.8763852343022005, + "learning_rate": 0.003, + "loss": 4.4062, + "step": 1224 + }, + { + "epoch": 0.01225, + "grad_norm": 0.8983930620848783, + "learning_rate": 0.003, + "loss": 4.4328, + "step": 1225 + }, + { + "epoch": 0.01226, + "grad_norm": 0.7843004711214445, + "learning_rate": 0.003, + "loss": 4.4231, + "step": 1226 + }, + { + "epoch": 0.01227, + "grad_norm": 0.6802825597762372, + "learning_rate": 0.003, + "loss": 4.4455, + "step": 1227 + }, + { + "epoch": 0.01228, + "grad_norm": 0.7175774602889506, + "learning_rate": 0.003, + "loss": 4.433, + "step": 1228 + }, + { + "epoch": 0.01229, + "grad_norm": 0.7124552785986754, + "learning_rate": 0.003, + "loss": 4.4177, + "step": 1229 + }, + { + "epoch": 0.0123, + "grad_norm": 0.7173646190359848, + "learning_rate": 0.003, + "loss": 4.4462, + "step": 1230 + }, + { + "epoch": 0.01231, + "grad_norm": 0.6710417652647681, + "learning_rate": 0.003, + "loss": 4.4192, + "step": 1231 + }, + { + "epoch": 0.01232, + "grad_norm": 0.5319364475378244, + "learning_rate": 0.003, + "loss": 4.3837, + "step": 1232 + }, + { + "epoch": 0.01233, + "grad_norm": 0.5107726510651173, + "learning_rate": 0.003, + "loss": 4.3892, + "step": 1233 + }, + { + "epoch": 0.01234, + "grad_norm": 0.525523603676889, + "learning_rate": 0.003, + "loss": 4.4032, + "step": 1234 + }, + { + "epoch": 0.01235, + "grad_norm": 0.4385933034868044, + "learning_rate": 0.003, + "loss": 4.3895, + "step": 1235 + }, + { + "epoch": 0.01236, + "grad_norm": 0.4417827360571799, + "learning_rate": 0.003, + "loss": 4.3922, + "step": 1236 + }, + { + "epoch": 0.01237, + "grad_norm": 0.3870170981569727, + "learning_rate": 0.003, + "loss": 4.3983, + "step": 1237 + }, + { + "epoch": 0.01238, + "grad_norm": 0.3450028372069669, + "learning_rate": 0.003, + "loss": 4.3918, + "step": 1238 + }, + { + "epoch": 0.01239, + "grad_norm": 0.32255856446948694, + "learning_rate": 0.003, + "loss": 4.3727, + "step": 1239 + }, + { + "epoch": 0.0124, + "grad_norm": 0.3078475989244171, + "learning_rate": 0.003, + "loss": 4.3537, + "step": 1240 + }, + { + "epoch": 0.01241, + "grad_norm": 0.34649646171341814, + "learning_rate": 0.003, + "loss": 4.3646, + "step": 1241 + }, + { + "epoch": 0.01242, + "grad_norm": 0.3965097983444254, + "learning_rate": 0.003, + "loss": 4.3706, + "step": 1242 + }, + { + "epoch": 0.01243, + "grad_norm": 0.4535687938279078, + "learning_rate": 0.003, + "loss": 4.3517, + "step": 1243 + }, + { + "epoch": 0.01244, + "grad_norm": 0.5186172619894055, + "learning_rate": 0.003, + "loss": 4.3669, + "step": 1244 + }, + { + "epoch": 0.01245, + "grad_norm": 0.6425018812487958, + "learning_rate": 0.003, + "loss": 4.363, + "step": 1245 + }, + { + "epoch": 0.01246, + "grad_norm": 0.6245723386860975, + "learning_rate": 0.003, + "loss": 4.3446, + "step": 1246 + }, + { + "epoch": 0.01247, + "grad_norm": 0.44377555171018157, + "learning_rate": 0.003, + "loss": 4.3915, + "step": 1247 + }, + { + "epoch": 0.01248, + "grad_norm": 0.5278029739976049, + "learning_rate": 0.003, + "loss": 4.3761, + "step": 1248 + }, + { + "epoch": 0.01249, + "grad_norm": 0.5102876241848929, + "learning_rate": 0.003, + "loss": 4.3677, + "step": 1249 + }, + { + "epoch": 0.0125, + "grad_norm": 0.39559739569893493, + "learning_rate": 0.003, + "loss": 4.3348, + "step": 1250 + }, + { + "epoch": 0.01251, + "grad_norm": 0.4774815455759189, + "learning_rate": 0.003, + "loss": 4.373, + "step": 1251 + }, + { + "epoch": 0.01252, + "grad_norm": 0.40459109080676114, + "learning_rate": 0.003, + "loss": 4.3467, + "step": 1252 + }, + { + "epoch": 0.01253, + "grad_norm": 0.3506155695588933, + "learning_rate": 0.003, + "loss": 4.3238, + "step": 1253 + }, + { + "epoch": 0.01254, + "grad_norm": 0.36046868647854385, + "learning_rate": 0.003, + "loss": 4.3697, + "step": 1254 + }, + { + "epoch": 0.01255, + "grad_norm": 0.400469314411457, + "learning_rate": 0.003, + "loss": 4.3202, + "step": 1255 + }, + { + "epoch": 0.01256, + "grad_norm": 0.42579169102032727, + "learning_rate": 0.003, + "loss": 4.3478, + "step": 1256 + }, + { + "epoch": 0.01257, + "grad_norm": 0.43666374954115567, + "learning_rate": 0.003, + "loss": 4.3342, + "step": 1257 + }, + { + "epoch": 0.01258, + "grad_norm": 0.3885559357601015, + "learning_rate": 0.003, + "loss": 4.3411, + "step": 1258 + }, + { + "epoch": 0.01259, + "grad_norm": 0.3450685274689168, + "learning_rate": 0.003, + "loss": 4.3417, + "step": 1259 + }, + { + "epoch": 0.0126, + "grad_norm": 0.3296913229590147, + "learning_rate": 0.003, + "loss": 4.3689, + "step": 1260 + }, + { + "epoch": 0.01261, + "grad_norm": 0.3469234561542926, + "learning_rate": 0.003, + "loss": 4.3628, + "step": 1261 + }, + { + "epoch": 0.01262, + "grad_norm": 0.40339400226308325, + "learning_rate": 0.003, + "loss": 4.3558, + "step": 1262 + }, + { + "epoch": 0.01263, + "grad_norm": 0.44597422220213867, + "learning_rate": 0.003, + "loss": 4.3512, + "step": 1263 + }, + { + "epoch": 0.01264, + "grad_norm": 0.4427407408205576, + "learning_rate": 0.003, + "loss": 4.338, + "step": 1264 + }, + { + "epoch": 0.01265, + "grad_norm": 0.41573445698488437, + "learning_rate": 0.003, + "loss": 4.3506, + "step": 1265 + }, + { + "epoch": 0.01266, + "grad_norm": 0.45186560215837035, + "learning_rate": 0.003, + "loss": 4.349, + "step": 1266 + }, + { + "epoch": 0.01267, + "grad_norm": 0.6290443655616889, + "learning_rate": 0.003, + "loss": 4.3767, + "step": 1267 + }, + { + "epoch": 0.01268, + "grad_norm": 0.8841541603744382, + "learning_rate": 0.003, + "loss": 4.3763, + "step": 1268 + }, + { + "epoch": 0.01269, + "grad_norm": 0.9324063273299563, + "learning_rate": 0.003, + "loss": 4.3633, + "step": 1269 + }, + { + "epoch": 0.0127, + "grad_norm": 0.7531387064097018, + "learning_rate": 0.003, + "loss": 4.389, + "step": 1270 + }, + { + "epoch": 0.01271, + "grad_norm": 0.7173511219844001, + "learning_rate": 0.003, + "loss": 4.3689, + "step": 1271 + }, + { + "epoch": 0.01272, + "grad_norm": 0.6910307872241689, + "learning_rate": 0.003, + "loss": 4.3626, + "step": 1272 + }, + { + "epoch": 0.01273, + "grad_norm": 0.6392091020786222, + "learning_rate": 0.003, + "loss": 4.3902, + "step": 1273 + }, + { + "epoch": 0.01274, + "grad_norm": 0.5683923983758625, + "learning_rate": 0.003, + "loss": 4.3811, + "step": 1274 + }, + { + "epoch": 0.01275, + "grad_norm": 0.48501080462586865, + "learning_rate": 0.003, + "loss": 4.3433, + "step": 1275 + }, + { + "epoch": 0.01276, + "grad_norm": 0.4388665728327947, + "learning_rate": 0.003, + "loss": 4.3487, + "step": 1276 + }, + { + "epoch": 0.01277, + "grad_norm": 0.5024100232553123, + "learning_rate": 0.003, + "loss": 4.358, + "step": 1277 + }, + { + "epoch": 0.01278, + "grad_norm": 0.5072917507898231, + "learning_rate": 0.003, + "loss": 4.3646, + "step": 1278 + }, + { + "epoch": 0.01279, + "grad_norm": 0.48942775338603506, + "learning_rate": 0.003, + "loss": 4.3461, + "step": 1279 + }, + { + "epoch": 0.0128, + "grad_norm": 0.5180680805997396, + "learning_rate": 0.003, + "loss": 4.3453, + "step": 1280 + }, + { + "epoch": 0.01281, + "grad_norm": 0.589882368752518, + "learning_rate": 0.003, + "loss": 4.3309, + "step": 1281 + }, + { + "epoch": 0.01282, + "grad_norm": 0.7217688053955748, + "learning_rate": 0.003, + "loss": 4.3833, + "step": 1282 + }, + { + "epoch": 0.01283, + "grad_norm": 0.849897015093446, + "learning_rate": 0.003, + "loss": 4.3595, + "step": 1283 + }, + { + "epoch": 0.01284, + "grad_norm": 0.769319717001856, + "learning_rate": 0.003, + "loss": 4.3713, + "step": 1284 + }, + { + "epoch": 0.01285, + "grad_norm": 0.7372234573055417, + "learning_rate": 0.003, + "loss": 4.3679, + "step": 1285 + }, + { + "epoch": 0.01286, + "grad_norm": 0.6185984538132406, + "learning_rate": 0.003, + "loss": 4.3884, + "step": 1286 + }, + { + "epoch": 0.01287, + "grad_norm": 0.566489365455937, + "learning_rate": 0.003, + "loss": 4.3487, + "step": 1287 + }, + { + "epoch": 0.01288, + "grad_norm": 0.583268123970244, + "learning_rate": 0.003, + "loss": 4.3654, + "step": 1288 + }, + { + "epoch": 0.01289, + "grad_norm": 0.5365604798494432, + "learning_rate": 0.003, + "loss": 4.3546, + "step": 1289 + }, + { + "epoch": 0.0129, + "grad_norm": 0.5898461372899422, + "learning_rate": 0.003, + "loss": 4.3784, + "step": 1290 + }, + { + "epoch": 0.01291, + "grad_norm": 0.7454645554405436, + "learning_rate": 0.003, + "loss": 4.3672, + "step": 1291 + }, + { + "epoch": 0.01292, + "grad_norm": 0.8872276988300176, + "learning_rate": 0.003, + "loss": 4.4157, + "step": 1292 + }, + { + "epoch": 0.01293, + "grad_norm": 0.8660558149247537, + "learning_rate": 0.003, + "loss": 4.4053, + "step": 1293 + }, + { + "epoch": 0.01294, + "grad_norm": 0.7869826032444561, + "learning_rate": 0.003, + "loss": 4.3679, + "step": 1294 + }, + { + "epoch": 0.01295, + "grad_norm": 0.7577761481466739, + "learning_rate": 0.003, + "loss": 4.3975, + "step": 1295 + }, + { + "epoch": 0.01296, + "grad_norm": 0.6748799445050047, + "learning_rate": 0.003, + "loss": 4.3598, + "step": 1296 + }, + { + "epoch": 0.01297, + "grad_norm": 0.670162676491956, + "learning_rate": 0.003, + "loss": 4.376, + "step": 1297 + }, + { + "epoch": 0.01298, + "grad_norm": 0.5691583859101702, + "learning_rate": 0.003, + "loss": 4.3472, + "step": 1298 + }, + { + "epoch": 0.01299, + "grad_norm": 0.5072926331461023, + "learning_rate": 0.003, + "loss": 4.3555, + "step": 1299 + }, + { + "epoch": 0.013, + "grad_norm": 0.4828769685141814, + "learning_rate": 0.003, + "loss": 4.343, + "step": 1300 + }, + { + "epoch": 0.01301, + "grad_norm": 0.4691822990462979, + "learning_rate": 0.003, + "loss": 4.3558, + "step": 1301 + }, + { + "epoch": 0.01302, + "grad_norm": 0.4140113973052622, + "learning_rate": 0.003, + "loss": 4.4006, + "step": 1302 + }, + { + "epoch": 0.01303, + "grad_norm": 0.3685109579455993, + "learning_rate": 0.003, + "loss": 4.3436, + "step": 1303 + }, + { + "epoch": 0.01304, + "grad_norm": 0.3760405201721622, + "learning_rate": 0.003, + "loss": 4.3696, + "step": 1304 + }, + { + "epoch": 0.01305, + "grad_norm": 0.3221665114274485, + "learning_rate": 0.003, + "loss": 4.3518, + "step": 1305 + }, + { + "epoch": 0.01306, + "grad_norm": 0.33492469868124675, + "learning_rate": 0.003, + "loss": 4.3452, + "step": 1306 + }, + { + "epoch": 0.01307, + "grad_norm": 0.33475611430641294, + "learning_rate": 0.003, + "loss": 4.3626, + "step": 1307 + }, + { + "epoch": 0.01308, + "grad_norm": 0.4027154015406206, + "learning_rate": 0.003, + "loss": 4.3385, + "step": 1308 + }, + { + "epoch": 0.01309, + "grad_norm": 0.5286332892700527, + "learning_rate": 0.003, + "loss": 4.3804, + "step": 1309 + }, + { + "epoch": 0.0131, + "grad_norm": 0.7090250449949596, + "learning_rate": 0.003, + "loss": 4.3359, + "step": 1310 + }, + { + "epoch": 0.01311, + "grad_norm": 0.9374592813554533, + "learning_rate": 0.003, + "loss": 4.3914, + "step": 1311 + }, + { + "epoch": 0.01312, + "grad_norm": 0.8386895366440477, + "learning_rate": 0.003, + "loss": 4.3696, + "step": 1312 + }, + { + "epoch": 0.01313, + "grad_norm": 0.7384206623774927, + "learning_rate": 0.003, + "loss": 4.3534, + "step": 1313 + }, + { + "epoch": 0.01314, + "grad_norm": 0.7139194720668496, + "learning_rate": 0.003, + "loss": 4.3527, + "step": 1314 + }, + { + "epoch": 0.01315, + "grad_norm": 0.7085227699461494, + "learning_rate": 0.003, + "loss": 4.3809, + "step": 1315 + }, + { + "epoch": 0.01316, + "grad_norm": 0.6924444519815, + "learning_rate": 0.003, + "loss": 4.3653, + "step": 1316 + }, + { + "epoch": 0.01317, + "grad_norm": 0.6387153931562799, + "learning_rate": 0.003, + "loss": 4.3902, + "step": 1317 + }, + { + "epoch": 0.01318, + "grad_norm": 0.6344112280159295, + "learning_rate": 0.003, + "loss": 4.3652, + "step": 1318 + }, + { + "epoch": 0.01319, + "grad_norm": 0.7270675767653099, + "learning_rate": 0.003, + "loss": 4.3841, + "step": 1319 + }, + { + "epoch": 0.0132, + "grad_norm": 0.6321319111156113, + "learning_rate": 0.003, + "loss": 4.3725, + "step": 1320 + }, + { + "epoch": 0.01321, + "grad_norm": 0.6141991678487796, + "learning_rate": 0.003, + "loss": 4.3508, + "step": 1321 + }, + { + "epoch": 0.01322, + "grad_norm": 0.5694545226818948, + "learning_rate": 0.003, + "loss": 4.3756, + "step": 1322 + }, + { + "epoch": 0.01323, + "grad_norm": 0.7445509945833578, + "learning_rate": 0.003, + "loss": 4.3766, + "step": 1323 + }, + { + "epoch": 0.01324, + "grad_norm": 0.7345275618591681, + "learning_rate": 0.003, + "loss": 4.3604, + "step": 1324 + }, + { + "epoch": 0.01325, + "grad_norm": 0.5731952380277242, + "learning_rate": 0.003, + "loss": 4.3595, + "step": 1325 + }, + { + "epoch": 0.01326, + "grad_norm": 0.49399828185062894, + "learning_rate": 0.003, + "loss": 4.3432, + "step": 1326 + }, + { + "epoch": 0.01327, + "grad_norm": 0.4625597253069682, + "learning_rate": 0.003, + "loss": 4.3586, + "step": 1327 + }, + { + "epoch": 0.01328, + "grad_norm": 0.46176548306971715, + "learning_rate": 0.003, + "loss": 4.3477, + "step": 1328 + }, + { + "epoch": 0.01329, + "grad_norm": 0.4120837821389968, + "learning_rate": 0.003, + "loss": 4.3099, + "step": 1329 + }, + { + "epoch": 0.0133, + "grad_norm": 0.3625953571207396, + "learning_rate": 0.003, + "loss": 4.3494, + "step": 1330 + }, + { + "epoch": 0.01331, + "grad_norm": 0.36184528613842726, + "learning_rate": 0.003, + "loss": 4.3351, + "step": 1331 + }, + { + "epoch": 0.01332, + "grad_norm": 0.3568024153641722, + "learning_rate": 0.003, + "loss": 4.3532, + "step": 1332 + }, + { + "epoch": 0.01333, + "grad_norm": 0.3204482353647099, + "learning_rate": 0.003, + "loss": 4.3252, + "step": 1333 + }, + { + "epoch": 0.01334, + "grad_norm": 0.29957411498207237, + "learning_rate": 0.003, + "loss": 4.3114, + "step": 1334 + }, + { + "epoch": 0.01335, + "grad_norm": 0.31171408266049466, + "learning_rate": 0.003, + "loss": 4.3334, + "step": 1335 + }, + { + "epoch": 0.01336, + "grad_norm": 0.28708995347069843, + "learning_rate": 0.003, + "loss": 4.3294, + "step": 1336 + }, + { + "epoch": 0.01337, + "grad_norm": 0.29278842241858566, + "learning_rate": 0.003, + "loss": 4.3434, + "step": 1337 + }, + { + "epoch": 0.01338, + "grad_norm": 0.2729723674299326, + "learning_rate": 0.003, + "loss": 4.3088, + "step": 1338 + }, + { + "epoch": 0.01339, + "grad_norm": 0.29431343213978217, + "learning_rate": 0.003, + "loss": 4.3327, + "step": 1339 + }, + { + "epoch": 0.0134, + "grad_norm": 0.3294175545018703, + "learning_rate": 0.003, + "loss": 4.3159, + "step": 1340 + }, + { + "epoch": 0.01341, + "grad_norm": 0.4346116539808444, + "learning_rate": 0.003, + "loss": 4.3329, + "step": 1341 + }, + { + "epoch": 0.01342, + "grad_norm": 0.6006528789210607, + "learning_rate": 0.003, + "loss": 4.3107, + "step": 1342 + }, + { + "epoch": 0.01343, + "grad_norm": 0.9468238356913852, + "learning_rate": 0.003, + "loss": 4.3323, + "step": 1343 + }, + { + "epoch": 0.01344, + "grad_norm": 0.9600917389412905, + "learning_rate": 0.003, + "loss": 4.3573, + "step": 1344 + }, + { + "epoch": 0.01345, + "grad_norm": 0.7454028532315747, + "learning_rate": 0.003, + "loss": 4.3824, + "step": 1345 + }, + { + "epoch": 0.01346, + "grad_norm": 0.9412385374275949, + "learning_rate": 0.003, + "loss": 4.3817, + "step": 1346 + }, + { + "epoch": 0.01347, + "grad_norm": 0.8019477836355356, + "learning_rate": 0.003, + "loss": 4.3621, + "step": 1347 + }, + { + "epoch": 0.01348, + "grad_norm": 0.822147627695476, + "learning_rate": 0.003, + "loss": 4.3672, + "step": 1348 + }, + { + "epoch": 0.01349, + "grad_norm": 0.7468015076519685, + "learning_rate": 0.003, + "loss": 4.3844, + "step": 1349 + }, + { + "epoch": 0.0135, + "grad_norm": 0.7150496437877271, + "learning_rate": 0.003, + "loss": 4.3769, + "step": 1350 + }, + { + "epoch": 0.01351, + "grad_norm": 0.6792275106065346, + "learning_rate": 0.003, + "loss": 4.4037, + "step": 1351 + }, + { + "epoch": 0.01352, + "grad_norm": 0.6749873536633729, + "learning_rate": 0.003, + "loss": 4.3795, + "step": 1352 + }, + { + "epoch": 0.01353, + "grad_norm": 0.5975751363464156, + "learning_rate": 0.003, + "loss": 4.3625, + "step": 1353 + }, + { + "epoch": 0.01354, + "grad_norm": 0.6367703716959885, + "learning_rate": 0.003, + "loss": 4.3473, + "step": 1354 + }, + { + "epoch": 0.01355, + "grad_norm": 0.6903253322513454, + "learning_rate": 0.003, + "loss": 4.3617, + "step": 1355 + }, + { + "epoch": 0.01356, + "grad_norm": 0.7830002665034361, + "learning_rate": 0.003, + "loss": 4.3589, + "step": 1356 + }, + { + "epoch": 0.01357, + "grad_norm": 0.7634637746675061, + "learning_rate": 0.003, + "loss": 4.351, + "step": 1357 + }, + { + "epoch": 0.01358, + "grad_norm": 0.6912113286037168, + "learning_rate": 0.003, + "loss": 4.3426, + "step": 1358 + }, + { + "epoch": 0.01359, + "grad_norm": 0.608100998568284, + "learning_rate": 0.003, + "loss": 4.3802, + "step": 1359 + }, + { + "epoch": 0.0136, + "grad_norm": 0.4174874313338161, + "learning_rate": 0.003, + "loss": 4.3375, + "step": 1360 + }, + { + "epoch": 0.01361, + "grad_norm": 0.4136844583981887, + "learning_rate": 0.003, + "loss": 4.3533, + "step": 1361 + }, + { + "epoch": 0.01362, + "grad_norm": 0.34986686555598134, + "learning_rate": 0.003, + "loss": 4.3597, + "step": 1362 + }, + { + "epoch": 0.01363, + "grad_norm": 0.39445259692219764, + "learning_rate": 0.003, + "loss": 4.3605, + "step": 1363 + }, + { + "epoch": 0.01364, + "grad_norm": 0.3392587016329843, + "learning_rate": 0.003, + "loss": 4.3508, + "step": 1364 + }, + { + "epoch": 0.01365, + "grad_norm": 0.3286929763598885, + "learning_rate": 0.003, + "loss": 4.3666, + "step": 1365 + }, + { + "epoch": 0.01366, + "grad_norm": 0.30698836061044843, + "learning_rate": 0.003, + "loss": 4.3194, + "step": 1366 + }, + { + "epoch": 0.01367, + "grad_norm": 0.3258356748399825, + "learning_rate": 0.003, + "loss": 4.3664, + "step": 1367 + }, + { + "epoch": 0.01368, + "grad_norm": 0.28974975652956814, + "learning_rate": 0.003, + "loss": 4.3364, + "step": 1368 + }, + { + "epoch": 0.01369, + "grad_norm": 0.3029711054161145, + "learning_rate": 0.003, + "loss": 4.3277, + "step": 1369 + }, + { + "epoch": 0.0137, + "grad_norm": 0.2864574506759695, + "learning_rate": 0.003, + "loss": 4.3508, + "step": 1370 + }, + { + "epoch": 0.01371, + "grad_norm": 0.2944593491964238, + "learning_rate": 0.003, + "loss": 4.3421, + "step": 1371 + }, + { + "epoch": 0.01372, + "grad_norm": 0.29733135265028415, + "learning_rate": 0.003, + "loss": 4.29, + "step": 1372 + }, + { + "epoch": 0.01373, + "grad_norm": 0.3030922885731935, + "learning_rate": 0.003, + "loss": 4.3287, + "step": 1373 + }, + { + "epoch": 0.01374, + "grad_norm": 0.3135661353258684, + "learning_rate": 0.003, + "loss": 4.3353, + "step": 1374 + }, + { + "epoch": 0.01375, + "grad_norm": 0.31552830475247895, + "learning_rate": 0.003, + "loss": 4.2954, + "step": 1375 + }, + { + "epoch": 0.01376, + "grad_norm": 0.3753685901400331, + "learning_rate": 0.003, + "loss": 4.3218, + "step": 1376 + }, + { + "epoch": 0.01377, + "grad_norm": 0.4931158688793232, + "learning_rate": 0.003, + "loss": 4.3247, + "step": 1377 + }, + { + "epoch": 0.01378, + "grad_norm": 0.6194459766224392, + "learning_rate": 0.003, + "loss": 4.3586, + "step": 1378 + }, + { + "epoch": 0.01379, + "grad_norm": 0.7157871954469385, + "learning_rate": 0.003, + "loss": 4.3344, + "step": 1379 + }, + { + "epoch": 0.0138, + "grad_norm": 0.7778989267777976, + "learning_rate": 0.003, + "loss": 4.3379, + "step": 1380 + }, + { + "epoch": 0.01381, + "grad_norm": 0.7540005493939272, + "learning_rate": 0.003, + "loss": 4.3478, + "step": 1381 + }, + { + "epoch": 0.01382, + "grad_norm": 0.6552744600197745, + "learning_rate": 0.003, + "loss": 4.3434, + "step": 1382 + }, + { + "epoch": 0.01383, + "grad_norm": 0.5244295500211882, + "learning_rate": 0.003, + "loss": 4.332, + "step": 1383 + }, + { + "epoch": 0.01384, + "grad_norm": 0.5199048618667436, + "learning_rate": 0.003, + "loss": 4.3604, + "step": 1384 + }, + { + "epoch": 0.01385, + "grad_norm": 0.5115879970370135, + "learning_rate": 0.003, + "loss": 4.3484, + "step": 1385 + }, + { + "epoch": 0.01386, + "grad_norm": 0.5337761524536188, + "learning_rate": 0.003, + "loss": 4.3229, + "step": 1386 + }, + { + "epoch": 0.01387, + "grad_norm": 0.49971027233062854, + "learning_rate": 0.003, + "loss": 4.3141, + "step": 1387 + }, + { + "epoch": 0.01388, + "grad_norm": 0.45570239975200477, + "learning_rate": 0.003, + "loss": 4.3501, + "step": 1388 + }, + { + "epoch": 0.01389, + "grad_norm": 0.4952533360995893, + "learning_rate": 0.003, + "loss": 4.329, + "step": 1389 + }, + { + "epoch": 0.0139, + "grad_norm": 0.5836752219457788, + "learning_rate": 0.003, + "loss": 4.3354, + "step": 1390 + }, + { + "epoch": 0.01391, + "grad_norm": 0.661197601877714, + "learning_rate": 0.003, + "loss": 4.3388, + "step": 1391 + }, + { + "epoch": 0.01392, + "grad_norm": 0.7580066821140803, + "learning_rate": 0.003, + "loss": 4.3629, + "step": 1392 + }, + { + "epoch": 0.01393, + "grad_norm": 0.864097109773997, + "learning_rate": 0.003, + "loss": 4.3351, + "step": 1393 + }, + { + "epoch": 0.01394, + "grad_norm": 0.8137871207398635, + "learning_rate": 0.003, + "loss": 4.3329, + "step": 1394 + }, + { + "epoch": 0.01395, + "grad_norm": 0.6452659589342782, + "learning_rate": 0.003, + "loss": 4.3528, + "step": 1395 + }, + { + "epoch": 0.01396, + "grad_norm": 0.7169962932582273, + "learning_rate": 0.003, + "loss": 4.3526, + "step": 1396 + }, + { + "epoch": 0.01397, + "grad_norm": 0.7722092445904787, + "learning_rate": 0.003, + "loss": 4.37, + "step": 1397 + }, + { + "epoch": 0.01398, + "grad_norm": 0.9201404272023904, + "learning_rate": 0.003, + "loss": 4.3584, + "step": 1398 + }, + { + "epoch": 0.01399, + "grad_norm": 0.8604376144856999, + "learning_rate": 0.003, + "loss": 4.3863, + "step": 1399 + }, + { + "epoch": 0.014, + "grad_norm": 0.7356178947310503, + "learning_rate": 0.003, + "loss": 4.3371, + "step": 1400 + }, + { + "epoch": 0.01401, + "grad_norm": 0.6698615745523159, + "learning_rate": 0.003, + "loss": 4.3548, + "step": 1401 + }, + { + "epoch": 0.01402, + "grad_norm": 0.678651105292387, + "learning_rate": 0.003, + "loss": 4.367, + "step": 1402 + }, + { + "epoch": 0.01403, + "grad_norm": 0.7164501675372886, + "learning_rate": 0.003, + "loss": 4.3428, + "step": 1403 + }, + { + "epoch": 0.01404, + "grad_norm": 0.7780771367935839, + "learning_rate": 0.003, + "loss": 4.3706, + "step": 1404 + }, + { + "epoch": 0.01405, + "grad_norm": 0.8097299555256038, + "learning_rate": 0.003, + "loss": 4.3921, + "step": 1405 + }, + { + "epoch": 0.01406, + "grad_norm": 0.8397836379510005, + "learning_rate": 0.003, + "loss": 4.3748, + "step": 1406 + }, + { + "epoch": 0.01407, + "grad_norm": 0.798434347116743, + "learning_rate": 0.003, + "loss": 4.3886, + "step": 1407 + }, + { + "epoch": 0.01408, + "grad_norm": 0.690162960674035, + "learning_rate": 0.003, + "loss": 4.3438, + "step": 1408 + }, + { + "epoch": 0.01409, + "grad_norm": 0.7158182494367376, + "learning_rate": 0.003, + "loss": 4.3755, + "step": 1409 + }, + { + "epoch": 0.0141, + "grad_norm": 0.6626220406570043, + "learning_rate": 0.003, + "loss": 4.3521, + "step": 1410 + }, + { + "epoch": 0.01411, + "grad_norm": 0.6080932877854873, + "learning_rate": 0.003, + "loss": 4.3227, + "step": 1411 + }, + { + "epoch": 0.01412, + "grad_norm": 0.6342264759843587, + "learning_rate": 0.003, + "loss": 4.3642, + "step": 1412 + }, + { + "epoch": 0.01413, + "grad_norm": 0.5857768731552475, + "learning_rate": 0.003, + "loss": 4.3559, + "step": 1413 + }, + { + "epoch": 0.01414, + "grad_norm": 0.575683339939827, + "learning_rate": 0.003, + "loss": 4.336, + "step": 1414 + }, + { + "epoch": 0.01415, + "grad_norm": 0.5478713782500327, + "learning_rate": 0.003, + "loss": 4.3284, + "step": 1415 + }, + { + "epoch": 0.01416, + "grad_norm": 0.48495351731286235, + "learning_rate": 0.003, + "loss": 4.3497, + "step": 1416 + }, + { + "epoch": 0.01417, + "grad_norm": 0.5582320312672604, + "learning_rate": 0.003, + "loss": 4.3419, + "step": 1417 + }, + { + "epoch": 0.01418, + "grad_norm": 0.6202372623477388, + "learning_rate": 0.003, + "loss": 4.3622, + "step": 1418 + }, + { + "epoch": 0.01419, + "grad_norm": 0.6521670079720687, + "learning_rate": 0.003, + "loss": 4.3534, + "step": 1419 + }, + { + "epoch": 0.0142, + "grad_norm": 0.5549508977810897, + "learning_rate": 0.003, + "loss": 4.3578, + "step": 1420 + }, + { + "epoch": 0.01421, + "grad_norm": 0.45613666396542774, + "learning_rate": 0.003, + "loss": 4.3197, + "step": 1421 + }, + { + "epoch": 0.01422, + "grad_norm": 0.445372260007137, + "learning_rate": 0.003, + "loss": 4.322, + "step": 1422 + }, + { + "epoch": 0.01423, + "grad_norm": 0.38402591989219925, + "learning_rate": 0.003, + "loss": 4.3246, + "step": 1423 + }, + { + "epoch": 0.01424, + "grad_norm": 0.38806384974852803, + "learning_rate": 0.003, + "loss": 4.3401, + "step": 1424 + }, + { + "epoch": 0.01425, + "grad_norm": 0.4132316355770368, + "learning_rate": 0.003, + "loss": 4.3436, + "step": 1425 + }, + { + "epoch": 0.01426, + "grad_norm": 0.3529318435487905, + "learning_rate": 0.003, + "loss": 4.3165, + "step": 1426 + }, + { + "epoch": 0.01427, + "grad_norm": 0.3824674454914564, + "learning_rate": 0.003, + "loss": 4.3137, + "step": 1427 + }, + { + "epoch": 0.01428, + "grad_norm": 0.40903598720809137, + "learning_rate": 0.003, + "loss": 4.3085, + "step": 1428 + }, + { + "epoch": 0.01429, + "grad_norm": 0.48492866215569996, + "learning_rate": 0.003, + "loss": 4.3211, + "step": 1429 + }, + { + "epoch": 0.0143, + "grad_norm": 0.5657427651213296, + "learning_rate": 0.003, + "loss": 4.3342, + "step": 1430 + }, + { + "epoch": 0.01431, + "grad_norm": 0.5675857894892379, + "learning_rate": 0.003, + "loss": 4.3246, + "step": 1431 + }, + { + "epoch": 0.01432, + "grad_norm": 0.5138100030227308, + "learning_rate": 0.003, + "loss": 4.3123, + "step": 1432 + }, + { + "epoch": 0.01433, + "grad_norm": 0.3834195074788409, + "learning_rate": 0.003, + "loss": 4.3046, + "step": 1433 + }, + { + "epoch": 0.01434, + "grad_norm": 0.380142493390664, + "learning_rate": 0.003, + "loss": 4.2898, + "step": 1434 + }, + { + "epoch": 0.01435, + "grad_norm": 0.40030955161206494, + "learning_rate": 0.003, + "loss": 4.2992, + "step": 1435 + }, + { + "epoch": 0.01436, + "grad_norm": 0.3954780968032791, + "learning_rate": 0.003, + "loss": 4.3173, + "step": 1436 + }, + { + "epoch": 0.01437, + "grad_norm": 0.3507385052853375, + "learning_rate": 0.003, + "loss": 4.3285, + "step": 1437 + }, + { + "epoch": 0.01438, + "grad_norm": 0.37418173745501515, + "learning_rate": 0.003, + "loss": 4.3228, + "step": 1438 + }, + { + "epoch": 0.01439, + "grad_norm": 0.3823240257736278, + "learning_rate": 0.003, + "loss": 4.2864, + "step": 1439 + }, + { + "epoch": 0.0144, + "grad_norm": 0.4883909418304791, + "learning_rate": 0.003, + "loss": 4.3011, + "step": 1440 + }, + { + "epoch": 0.01441, + "grad_norm": 0.5551175791026393, + "learning_rate": 0.003, + "loss": 4.3126, + "step": 1441 + }, + { + "epoch": 0.01442, + "grad_norm": 0.5566257513382789, + "learning_rate": 0.003, + "loss": 4.2964, + "step": 1442 + }, + { + "epoch": 0.01443, + "grad_norm": 0.5444967253409229, + "learning_rate": 0.003, + "loss": 4.3586, + "step": 1443 + }, + { + "epoch": 0.01444, + "grad_norm": 0.5868401236318755, + "learning_rate": 0.003, + "loss": 4.3246, + "step": 1444 + }, + { + "epoch": 0.01445, + "grad_norm": 0.49247571907636584, + "learning_rate": 0.003, + "loss": 4.282, + "step": 1445 + }, + { + "epoch": 0.01446, + "grad_norm": 0.4611996904427917, + "learning_rate": 0.003, + "loss": 4.2968, + "step": 1446 + }, + { + "epoch": 0.01447, + "grad_norm": 0.4704096683460519, + "learning_rate": 0.003, + "loss": 4.3067, + "step": 1447 + }, + { + "epoch": 0.01448, + "grad_norm": 0.39306824337135715, + "learning_rate": 0.003, + "loss": 4.3134, + "step": 1448 + }, + { + "epoch": 0.01449, + "grad_norm": 0.41700654092261286, + "learning_rate": 0.003, + "loss": 4.3094, + "step": 1449 + }, + { + "epoch": 0.0145, + "grad_norm": 0.407843055117697, + "learning_rate": 0.003, + "loss": 4.3225, + "step": 1450 + }, + { + "epoch": 0.01451, + "grad_norm": 0.46619408673841983, + "learning_rate": 0.003, + "loss": 4.304, + "step": 1451 + }, + { + "epoch": 0.01452, + "grad_norm": 0.5845723252800918, + "learning_rate": 0.003, + "loss": 4.3078, + "step": 1452 + }, + { + "epoch": 0.01453, + "grad_norm": 0.6930673151633832, + "learning_rate": 0.003, + "loss": 4.3299, + "step": 1453 + }, + { + "epoch": 0.01454, + "grad_norm": 0.7096402728772245, + "learning_rate": 0.003, + "loss": 4.2995, + "step": 1454 + }, + { + "epoch": 0.01455, + "grad_norm": 0.6610327702633573, + "learning_rate": 0.003, + "loss": 4.3327, + "step": 1455 + }, + { + "epoch": 0.01456, + "grad_norm": 0.6903492208096148, + "learning_rate": 0.003, + "loss": 4.3118, + "step": 1456 + }, + { + "epoch": 0.01457, + "grad_norm": 0.5695969137152378, + "learning_rate": 0.003, + "loss": 4.2978, + "step": 1457 + }, + { + "epoch": 0.01458, + "grad_norm": 0.5248689907637836, + "learning_rate": 0.003, + "loss": 4.2909, + "step": 1458 + }, + { + "epoch": 0.01459, + "grad_norm": 0.49793282629814667, + "learning_rate": 0.003, + "loss": 4.318, + "step": 1459 + }, + { + "epoch": 0.0146, + "grad_norm": 0.5259290959891172, + "learning_rate": 0.003, + "loss": 4.3252, + "step": 1460 + }, + { + "epoch": 0.01461, + "grad_norm": 0.5423165316350705, + "learning_rate": 0.003, + "loss": 4.3026, + "step": 1461 + }, + { + "epoch": 0.01462, + "grad_norm": 0.5122102135827509, + "learning_rate": 0.003, + "loss": 4.3083, + "step": 1462 + }, + { + "epoch": 0.01463, + "grad_norm": 0.48806792365665164, + "learning_rate": 0.003, + "loss": 4.304, + "step": 1463 + }, + { + "epoch": 0.01464, + "grad_norm": 0.4826165334345253, + "learning_rate": 0.003, + "loss": 4.2803, + "step": 1464 + }, + { + "epoch": 0.01465, + "grad_norm": 0.48806303528119876, + "learning_rate": 0.003, + "loss": 4.325, + "step": 1465 + }, + { + "epoch": 0.01466, + "grad_norm": 0.5288880864983577, + "learning_rate": 0.003, + "loss": 4.2948, + "step": 1466 + }, + { + "epoch": 0.01467, + "grad_norm": 0.6678963880365116, + "learning_rate": 0.003, + "loss": 4.3367, + "step": 1467 + }, + { + "epoch": 0.01468, + "grad_norm": 0.8507141482558338, + "learning_rate": 0.003, + "loss": 4.3259, + "step": 1468 + }, + { + "epoch": 0.01469, + "grad_norm": 0.9674104680499097, + "learning_rate": 0.003, + "loss": 4.3427, + "step": 1469 + }, + { + "epoch": 0.0147, + "grad_norm": 0.9577096701541407, + "learning_rate": 0.003, + "loss": 4.3502, + "step": 1470 + }, + { + "epoch": 0.01471, + "grad_norm": 0.9021599056855796, + "learning_rate": 0.003, + "loss": 4.3567, + "step": 1471 + }, + { + "epoch": 0.01472, + "grad_norm": 0.8933064062959534, + "learning_rate": 0.003, + "loss": 4.3477, + "step": 1472 + }, + { + "epoch": 0.01473, + "grad_norm": 0.8969620681941299, + "learning_rate": 0.003, + "loss": 4.3708, + "step": 1473 + }, + { + "epoch": 0.01474, + "grad_norm": 0.6674847167774206, + "learning_rate": 0.003, + "loss": 4.3402, + "step": 1474 + }, + { + "epoch": 0.01475, + "grad_norm": 0.7252535747045, + "learning_rate": 0.003, + "loss": 4.3675, + "step": 1475 + }, + { + "epoch": 0.01476, + "grad_norm": 0.7222358160132806, + "learning_rate": 0.003, + "loss": 4.3503, + "step": 1476 + }, + { + "epoch": 0.01477, + "grad_norm": 0.837925142735814, + "learning_rate": 0.003, + "loss": 4.3614, + "step": 1477 + }, + { + "epoch": 0.01478, + "grad_norm": 0.8571430973709657, + "learning_rate": 0.003, + "loss": 4.3541, + "step": 1478 + }, + { + "epoch": 0.01479, + "grad_norm": 0.7748951970932865, + "learning_rate": 0.003, + "loss": 4.3832, + "step": 1479 + }, + { + "epoch": 0.0148, + "grad_norm": 0.5769653553075942, + "learning_rate": 0.003, + "loss": 4.3406, + "step": 1480 + }, + { + "epoch": 0.01481, + "grad_norm": 0.5617397637544572, + "learning_rate": 0.003, + "loss": 4.343, + "step": 1481 + }, + { + "epoch": 0.01482, + "grad_norm": 0.494589006608917, + "learning_rate": 0.003, + "loss": 4.3215, + "step": 1482 + }, + { + "epoch": 0.01483, + "grad_norm": 0.5358352358514579, + "learning_rate": 0.003, + "loss": 4.3218, + "step": 1483 + }, + { + "epoch": 0.01484, + "grad_norm": 0.4638148892097882, + "learning_rate": 0.003, + "loss": 4.3656, + "step": 1484 + }, + { + "epoch": 0.01485, + "grad_norm": 0.5225092823087141, + "learning_rate": 0.003, + "loss": 4.3308, + "step": 1485 + }, + { + "epoch": 0.01486, + "grad_norm": 0.5536760113665742, + "learning_rate": 0.003, + "loss": 4.3262, + "step": 1486 + }, + { + "epoch": 0.01487, + "grad_norm": 0.5280609352965188, + "learning_rate": 0.003, + "loss": 4.3256, + "step": 1487 + }, + { + "epoch": 0.01488, + "grad_norm": 0.6169285208705055, + "learning_rate": 0.003, + "loss": 4.3379, + "step": 1488 + }, + { + "epoch": 0.01489, + "grad_norm": 0.7420878936102314, + "learning_rate": 0.003, + "loss": 4.3425, + "step": 1489 + }, + { + "epoch": 0.0149, + "grad_norm": 0.7781493014062594, + "learning_rate": 0.003, + "loss": 4.3387, + "step": 1490 + }, + { + "epoch": 0.01491, + "grad_norm": 0.6235654472208051, + "learning_rate": 0.003, + "loss": 4.3459, + "step": 1491 + }, + { + "epoch": 0.01492, + "grad_norm": 0.5493424470537548, + "learning_rate": 0.003, + "loss": 4.3048, + "step": 1492 + }, + { + "epoch": 0.01493, + "grad_norm": 0.5353895236188659, + "learning_rate": 0.003, + "loss": 4.3336, + "step": 1493 + }, + { + "epoch": 0.01494, + "grad_norm": 0.4821593967850022, + "learning_rate": 0.003, + "loss": 4.3113, + "step": 1494 + }, + { + "epoch": 0.01495, + "grad_norm": 0.4536978313935068, + "learning_rate": 0.003, + "loss": 4.3201, + "step": 1495 + }, + { + "epoch": 0.01496, + "grad_norm": 0.4449517853772135, + "learning_rate": 0.003, + "loss": 4.294, + "step": 1496 + }, + { + "epoch": 0.01497, + "grad_norm": 0.39867192565794446, + "learning_rate": 0.003, + "loss": 4.2656, + "step": 1497 + }, + { + "epoch": 0.01498, + "grad_norm": 0.3910710132293917, + "learning_rate": 0.003, + "loss": 4.3202, + "step": 1498 + }, + { + "epoch": 0.01499, + "grad_norm": 0.368047206254813, + "learning_rate": 0.003, + "loss": 4.339, + "step": 1499 + }, + { + "epoch": 0.015, + "grad_norm": 0.34050808541322203, + "learning_rate": 0.003, + "loss": 4.2902, + "step": 1500 + }, + { + "epoch": 0.01501, + "grad_norm": 0.36125568650846523, + "learning_rate": 0.003, + "loss": 4.2886, + "step": 1501 + }, + { + "epoch": 0.01502, + "grad_norm": 0.3250619223936781, + "learning_rate": 0.003, + "loss": 4.3299, + "step": 1502 + }, + { + "epoch": 0.01503, + "grad_norm": 0.362004630486072, + "learning_rate": 0.003, + "loss": 4.3091, + "step": 1503 + }, + { + "epoch": 0.01504, + "grad_norm": 0.42640653384457183, + "learning_rate": 0.003, + "loss": 4.3092, + "step": 1504 + }, + { + "epoch": 0.01505, + "grad_norm": 0.4655634074328374, + "learning_rate": 0.003, + "loss": 4.2935, + "step": 1505 + }, + { + "epoch": 0.01506, + "grad_norm": 0.4830681135087923, + "learning_rate": 0.003, + "loss": 4.3131, + "step": 1506 + }, + { + "epoch": 0.01507, + "grad_norm": 0.45072861631675815, + "learning_rate": 0.003, + "loss": 4.3252, + "step": 1507 + }, + { + "epoch": 0.01508, + "grad_norm": 0.41884675318056874, + "learning_rate": 0.003, + "loss": 4.2996, + "step": 1508 + }, + { + "epoch": 0.01509, + "grad_norm": 0.45582678472670524, + "learning_rate": 0.003, + "loss": 4.32, + "step": 1509 + }, + { + "epoch": 0.0151, + "grad_norm": 0.4555994257034133, + "learning_rate": 0.003, + "loss": 4.2923, + "step": 1510 + }, + { + "epoch": 0.01511, + "grad_norm": 0.47932862889061606, + "learning_rate": 0.003, + "loss": 4.2807, + "step": 1511 + }, + { + "epoch": 0.01512, + "grad_norm": 0.4824629582897255, + "learning_rate": 0.003, + "loss": 4.2928, + "step": 1512 + }, + { + "epoch": 0.01513, + "grad_norm": 0.4551520372558624, + "learning_rate": 0.003, + "loss": 4.2958, + "step": 1513 + }, + { + "epoch": 0.01514, + "grad_norm": 0.40280658680118386, + "learning_rate": 0.003, + "loss": 4.3004, + "step": 1514 + }, + { + "epoch": 0.01515, + "grad_norm": 0.4291367693804096, + "learning_rate": 0.003, + "loss": 4.2938, + "step": 1515 + }, + { + "epoch": 0.01516, + "grad_norm": 0.5124558841404536, + "learning_rate": 0.003, + "loss": 4.3014, + "step": 1516 + }, + { + "epoch": 0.01517, + "grad_norm": 0.662211893130953, + "learning_rate": 0.003, + "loss": 4.2591, + "step": 1517 + }, + { + "epoch": 0.01518, + "grad_norm": 0.7783947565165081, + "learning_rate": 0.003, + "loss": 4.3029, + "step": 1518 + }, + { + "epoch": 0.01519, + "grad_norm": 0.7115931895318321, + "learning_rate": 0.003, + "loss": 4.3425, + "step": 1519 + }, + { + "epoch": 0.0152, + "grad_norm": 0.727509066973132, + "learning_rate": 0.003, + "loss": 4.2935, + "step": 1520 + }, + { + "epoch": 0.01521, + "grad_norm": 0.8200160562394482, + "learning_rate": 0.003, + "loss": 4.3183, + "step": 1521 + }, + { + "epoch": 0.01522, + "grad_norm": 0.785335657111215, + "learning_rate": 0.003, + "loss": 4.3154, + "step": 1522 + }, + { + "epoch": 0.01523, + "grad_norm": 0.6269328513970822, + "learning_rate": 0.003, + "loss": 4.3029, + "step": 1523 + }, + { + "epoch": 0.01524, + "grad_norm": 0.6280834911768081, + "learning_rate": 0.003, + "loss": 4.3392, + "step": 1524 + }, + { + "epoch": 0.01525, + "grad_norm": 0.562699648791423, + "learning_rate": 0.003, + "loss": 4.3367, + "step": 1525 + }, + { + "epoch": 0.01526, + "grad_norm": 0.6402790783462683, + "learning_rate": 0.003, + "loss": 4.3277, + "step": 1526 + }, + { + "epoch": 0.01527, + "grad_norm": 0.6038705757661785, + "learning_rate": 0.003, + "loss": 4.2979, + "step": 1527 + }, + { + "epoch": 0.01528, + "grad_norm": 0.636946924703658, + "learning_rate": 0.003, + "loss": 4.3174, + "step": 1528 + }, + { + "epoch": 0.01529, + "grad_norm": 0.6521902920068134, + "learning_rate": 0.003, + "loss": 4.3136, + "step": 1529 + }, + { + "epoch": 0.0153, + "grad_norm": 0.7380737016000076, + "learning_rate": 0.003, + "loss": 4.3247, + "step": 1530 + }, + { + "epoch": 0.01531, + "grad_norm": 0.7548829272902333, + "learning_rate": 0.003, + "loss": 4.3243, + "step": 1531 + }, + { + "epoch": 0.01532, + "grad_norm": 0.7137520123617993, + "learning_rate": 0.003, + "loss": 4.3014, + "step": 1532 + }, + { + "epoch": 0.01533, + "grad_norm": 0.7523921063923493, + "learning_rate": 0.003, + "loss": 4.3327, + "step": 1533 + }, + { + "epoch": 0.01534, + "grad_norm": 0.7506503205085225, + "learning_rate": 0.003, + "loss": 4.3336, + "step": 1534 + }, + { + "epoch": 0.01535, + "grad_norm": 0.7303716243088675, + "learning_rate": 0.003, + "loss": 4.3418, + "step": 1535 + }, + { + "epoch": 0.01536, + "grad_norm": 0.6839262408941584, + "learning_rate": 0.003, + "loss": 4.32, + "step": 1536 + }, + { + "epoch": 0.01537, + "grad_norm": 0.7109733936082817, + "learning_rate": 0.003, + "loss": 4.3199, + "step": 1537 + }, + { + "epoch": 0.01538, + "grad_norm": 0.7061208974263588, + "learning_rate": 0.003, + "loss": 4.296, + "step": 1538 + }, + { + "epoch": 0.01539, + "grad_norm": 0.6986672189571356, + "learning_rate": 0.003, + "loss": 4.3349, + "step": 1539 + }, + { + "epoch": 0.0154, + "grad_norm": 0.6534468982180698, + "learning_rate": 0.003, + "loss": 4.333, + "step": 1540 + }, + { + "epoch": 0.01541, + "grad_norm": 0.49825734343698913, + "learning_rate": 0.003, + "loss": 4.3068, + "step": 1541 + }, + { + "epoch": 0.01542, + "grad_norm": 0.44352017421476997, + "learning_rate": 0.003, + "loss": 4.3332, + "step": 1542 + }, + { + "epoch": 0.01543, + "grad_norm": 0.38211515055441725, + "learning_rate": 0.003, + "loss": 4.3129, + "step": 1543 + }, + { + "epoch": 0.01544, + "grad_norm": 0.38911843319302664, + "learning_rate": 0.003, + "loss": 4.2994, + "step": 1544 + }, + { + "epoch": 0.01545, + "grad_norm": 0.3624309615071896, + "learning_rate": 0.003, + "loss": 4.3625, + "step": 1545 + }, + { + "epoch": 0.01546, + "grad_norm": 0.4569500368392657, + "learning_rate": 0.003, + "loss": 4.3313, + "step": 1546 + }, + { + "epoch": 0.01547, + "grad_norm": 0.5377623135091468, + "learning_rate": 0.003, + "loss": 4.3355, + "step": 1547 + }, + { + "epoch": 0.01548, + "grad_norm": 0.6297787769238979, + "learning_rate": 0.003, + "loss": 4.3034, + "step": 1548 + }, + { + "epoch": 0.01549, + "grad_norm": 0.644439142847034, + "learning_rate": 0.003, + "loss": 4.2897, + "step": 1549 + }, + { + "epoch": 0.0155, + "grad_norm": 0.618142851349942, + "learning_rate": 0.003, + "loss": 4.3109, + "step": 1550 + }, + { + "epoch": 0.01551, + "grad_norm": 0.5448565014455535, + "learning_rate": 0.003, + "loss": 4.2869, + "step": 1551 + }, + { + "epoch": 0.01552, + "grad_norm": 0.5723351701615428, + "learning_rate": 0.003, + "loss": 4.3071, + "step": 1552 + }, + { + "epoch": 0.01553, + "grad_norm": 0.6022810466873098, + "learning_rate": 0.003, + "loss": 4.3328, + "step": 1553 + }, + { + "epoch": 0.01554, + "grad_norm": 0.6989931225913981, + "learning_rate": 0.003, + "loss": 4.324, + "step": 1554 + }, + { + "epoch": 0.01555, + "grad_norm": 0.6580206580270094, + "learning_rate": 0.003, + "loss": 4.2931, + "step": 1555 + }, + { + "epoch": 0.01556, + "grad_norm": 0.6533983591917718, + "learning_rate": 0.003, + "loss": 4.3308, + "step": 1556 + }, + { + "epoch": 0.01557, + "grad_norm": 0.6295945004011335, + "learning_rate": 0.003, + "loss": 4.3036, + "step": 1557 + }, + { + "epoch": 0.01558, + "grad_norm": 0.5371854889087798, + "learning_rate": 0.003, + "loss": 4.2914, + "step": 1558 + }, + { + "epoch": 0.01559, + "grad_norm": 0.4632942102113827, + "learning_rate": 0.003, + "loss": 4.326, + "step": 1559 + }, + { + "epoch": 0.0156, + "grad_norm": 0.5051005361340883, + "learning_rate": 0.003, + "loss": 4.3169, + "step": 1560 + }, + { + "epoch": 0.01561, + "grad_norm": 0.5283819882561609, + "learning_rate": 0.003, + "loss": 4.299, + "step": 1561 + }, + { + "epoch": 0.01562, + "grad_norm": 0.591562612461458, + "learning_rate": 0.003, + "loss": 4.2958, + "step": 1562 + }, + { + "epoch": 0.01563, + "grad_norm": 0.6357918082942466, + "learning_rate": 0.003, + "loss": 4.3041, + "step": 1563 + }, + { + "epoch": 0.01564, + "grad_norm": 0.6669974739393074, + "learning_rate": 0.003, + "loss": 4.3336, + "step": 1564 + }, + { + "epoch": 0.01565, + "grad_norm": 0.691137766433848, + "learning_rate": 0.003, + "loss": 4.3141, + "step": 1565 + }, + { + "epoch": 0.01566, + "grad_norm": 0.5930174244490458, + "learning_rate": 0.003, + "loss": 4.3298, + "step": 1566 + }, + { + "epoch": 0.01567, + "grad_norm": 0.551100674489701, + "learning_rate": 0.003, + "loss": 4.319, + "step": 1567 + }, + { + "epoch": 0.01568, + "grad_norm": 0.5879757029751648, + "learning_rate": 0.003, + "loss": 4.3039, + "step": 1568 + }, + { + "epoch": 0.01569, + "grad_norm": 0.5302512625540108, + "learning_rate": 0.003, + "loss": 4.3069, + "step": 1569 + }, + { + "epoch": 0.0157, + "grad_norm": 0.49569383773361164, + "learning_rate": 0.003, + "loss": 4.3233, + "step": 1570 + }, + { + "epoch": 0.01571, + "grad_norm": 0.4733165510784743, + "learning_rate": 0.003, + "loss": 4.3065, + "step": 1571 + }, + { + "epoch": 0.01572, + "grad_norm": 0.4449250863339169, + "learning_rate": 0.003, + "loss": 4.3135, + "step": 1572 + }, + { + "epoch": 0.01573, + "grad_norm": 0.3929931575857813, + "learning_rate": 0.003, + "loss": 4.2941, + "step": 1573 + }, + { + "epoch": 0.01574, + "grad_norm": 0.4142053233129594, + "learning_rate": 0.003, + "loss": 4.3098, + "step": 1574 + }, + { + "epoch": 0.01575, + "grad_norm": 0.3709436905014968, + "learning_rate": 0.003, + "loss": 4.2702, + "step": 1575 + }, + { + "epoch": 0.01576, + "grad_norm": 0.36495283580117344, + "learning_rate": 0.003, + "loss": 4.2914, + "step": 1576 + }, + { + "epoch": 0.01577, + "grad_norm": 0.3915297767332071, + "learning_rate": 0.003, + "loss": 4.2525, + "step": 1577 + }, + { + "epoch": 0.01578, + "grad_norm": 0.5001638890424236, + "learning_rate": 0.003, + "loss": 4.284, + "step": 1578 + }, + { + "epoch": 0.01579, + "grad_norm": 0.6179833768395876, + "learning_rate": 0.003, + "loss": 4.2911, + "step": 1579 + }, + { + "epoch": 0.0158, + "grad_norm": 0.6342638472068223, + "learning_rate": 0.003, + "loss": 4.299, + "step": 1580 + }, + { + "epoch": 0.01581, + "grad_norm": 0.5432322505669018, + "learning_rate": 0.003, + "loss": 4.3062, + "step": 1581 + }, + { + "epoch": 0.01582, + "grad_norm": 0.4889222240722665, + "learning_rate": 0.003, + "loss": 4.2846, + "step": 1582 + }, + { + "epoch": 0.01583, + "grad_norm": 0.5515618698965413, + "learning_rate": 0.003, + "loss": 4.274, + "step": 1583 + }, + { + "epoch": 0.01584, + "grad_norm": 0.549844036531772, + "learning_rate": 0.003, + "loss": 4.3079, + "step": 1584 + }, + { + "epoch": 0.01585, + "grad_norm": 0.48716680666102685, + "learning_rate": 0.003, + "loss": 4.2981, + "step": 1585 + }, + { + "epoch": 0.01586, + "grad_norm": 0.623103221118658, + "learning_rate": 0.003, + "loss": 4.2774, + "step": 1586 + }, + { + "epoch": 0.01587, + "grad_norm": 0.7984823726854448, + "learning_rate": 0.003, + "loss": 4.3077, + "step": 1587 + }, + { + "epoch": 0.01588, + "grad_norm": 0.7627654453676783, + "learning_rate": 0.003, + "loss": 4.3371, + "step": 1588 + }, + { + "epoch": 0.01589, + "grad_norm": 0.7599095947865857, + "learning_rate": 0.003, + "loss": 4.3312, + "step": 1589 + }, + { + "epoch": 0.0159, + "grad_norm": 0.8159908089496294, + "learning_rate": 0.003, + "loss": 4.33, + "step": 1590 + }, + { + "epoch": 0.01591, + "grad_norm": 0.8803872581529493, + "learning_rate": 0.003, + "loss": 4.3476, + "step": 1591 + }, + { + "epoch": 0.01592, + "grad_norm": 0.9820230872746389, + "learning_rate": 0.003, + "loss": 4.349, + "step": 1592 + }, + { + "epoch": 0.01593, + "grad_norm": 0.9673129749259365, + "learning_rate": 0.003, + "loss": 4.352, + "step": 1593 + }, + { + "epoch": 0.01594, + "grad_norm": 0.9293561890551637, + "learning_rate": 0.003, + "loss": 4.3487, + "step": 1594 + }, + { + "epoch": 0.01595, + "grad_norm": 0.9561454087951702, + "learning_rate": 0.003, + "loss": 4.3811, + "step": 1595 + }, + { + "epoch": 0.01596, + "grad_norm": 1.0372746249351796, + "learning_rate": 0.003, + "loss": 4.3847, + "step": 1596 + }, + { + "epoch": 0.01597, + "grad_norm": 0.9040299607313536, + "learning_rate": 0.003, + "loss": 4.3427, + "step": 1597 + }, + { + "epoch": 0.01598, + "grad_norm": 1.1389328260746334, + "learning_rate": 0.003, + "loss": 4.3536, + "step": 1598 + }, + { + "epoch": 0.01599, + "grad_norm": 1.097139764937509, + "learning_rate": 0.003, + "loss": 4.3856, + "step": 1599 + }, + { + "epoch": 0.016, + "grad_norm": 0.9605869668937348, + "learning_rate": 0.003, + "loss": 4.3534, + "step": 1600 + }, + { + "epoch": 0.01601, + "grad_norm": 0.7696982537009223, + "learning_rate": 0.003, + "loss": 4.3899, + "step": 1601 + }, + { + "epoch": 0.01602, + "grad_norm": 0.7083103903580285, + "learning_rate": 0.003, + "loss": 4.3647, + "step": 1602 + }, + { + "epoch": 0.01603, + "grad_norm": 0.5787396432841793, + "learning_rate": 0.003, + "loss": 4.3713, + "step": 1603 + }, + { + "epoch": 0.01604, + "grad_norm": 0.5028516500547362, + "learning_rate": 0.003, + "loss": 4.3577, + "step": 1604 + }, + { + "epoch": 0.01605, + "grad_norm": 0.4938813453051197, + "learning_rate": 0.003, + "loss": 4.3306, + "step": 1605 + }, + { + "epoch": 0.01606, + "grad_norm": 0.5541639051688618, + "learning_rate": 0.003, + "loss": 4.3313, + "step": 1606 + }, + { + "epoch": 0.01607, + "grad_norm": 0.5139661852261925, + "learning_rate": 0.003, + "loss": 4.3576, + "step": 1607 + }, + { + "epoch": 0.01608, + "grad_norm": 0.48348767192426967, + "learning_rate": 0.003, + "loss": 4.3253, + "step": 1608 + }, + { + "epoch": 0.01609, + "grad_norm": 0.40136757384139854, + "learning_rate": 0.003, + "loss": 4.3283, + "step": 1609 + }, + { + "epoch": 0.0161, + "grad_norm": 0.4218183451690125, + "learning_rate": 0.003, + "loss": 4.2908, + "step": 1610 + }, + { + "epoch": 0.01611, + "grad_norm": 0.460848545000983, + "learning_rate": 0.003, + "loss": 4.31, + "step": 1611 + }, + { + "epoch": 0.01612, + "grad_norm": 0.4659444292137467, + "learning_rate": 0.003, + "loss": 4.3003, + "step": 1612 + }, + { + "epoch": 0.01613, + "grad_norm": 0.4112363124807694, + "learning_rate": 0.003, + "loss": 4.3411, + "step": 1613 + }, + { + "epoch": 0.01614, + "grad_norm": 0.3883756371913985, + "learning_rate": 0.003, + "loss": 4.3624, + "step": 1614 + }, + { + "epoch": 0.01615, + "grad_norm": 0.3536906211225033, + "learning_rate": 0.003, + "loss": 4.3264, + "step": 1615 + }, + { + "epoch": 0.01616, + "grad_norm": 0.31345780077250956, + "learning_rate": 0.003, + "loss": 4.3029, + "step": 1616 + }, + { + "epoch": 0.01617, + "grad_norm": 0.30058841204286124, + "learning_rate": 0.003, + "loss": 4.2916, + "step": 1617 + }, + { + "epoch": 0.01618, + "grad_norm": 0.27199761658041965, + "learning_rate": 0.003, + "loss": 4.2925, + "step": 1618 + }, + { + "epoch": 0.01619, + "grad_norm": 0.24962840642226738, + "learning_rate": 0.003, + "loss": 4.2798, + "step": 1619 + }, + { + "epoch": 0.0162, + "grad_norm": 0.2624146116891587, + "learning_rate": 0.003, + "loss": 4.2863, + "step": 1620 + }, + { + "epoch": 0.01621, + "grad_norm": 0.2537767483196262, + "learning_rate": 0.003, + "loss": 4.2811, + "step": 1621 + }, + { + "epoch": 0.01622, + "grad_norm": 0.3339010418458089, + "learning_rate": 0.003, + "loss": 4.3143, + "step": 1622 + }, + { + "epoch": 0.01623, + "grad_norm": 0.4481356309307089, + "learning_rate": 0.003, + "loss": 4.2931, + "step": 1623 + }, + { + "epoch": 0.01624, + "grad_norm": 0.6923232940858078, + "learning_rate": 0.003, + "loss": 4.3139, + "step": 1624 + }, + { + "epoch": 0.01625, + "grad_norm": 0.796128597644668, + "learning_rate": 0.003, + "loss": 4.3039, + "step": 1625 + }, + { + "epoch": 0.01626, + "grad_norm": 0.4972515352836982, + "learning_rate": 0.003, + "loss": 4.3127, + "step": 1626 + }, + { + "epoch": 0.01627, + "grad_norm": 0.5373908693508559, + "learning_rate": 0.003, + "loss": 4.2845, + "step": 1627 + }, + { + "epoch": 0.01628, + "grad_norm": 0.576906019908009, + "learning_rate": 0.003, + "loss": 4.2936, + "step": 1628 + }, + { + "epoch": 0.01629, + "grad_norm": 0.4857532494288208, + "learning_rate": 0.003, + "loss": 4.2886, + "step": 1629 + }, + { + "epoch": 0.0163, + "grad_norm": 0.6536351383709068, + "learning_rate": 0.003, + "loss": 4.284, + "step": 1630 + }, + { + "epoch": 0.01631, + "grad_norm": 0.6136531914830949, + "learning_rate": 0.003, + "loss": 4.3036, + "step": 1631 + }, + { + "epoch": 0.01632, + "grad_norm": 0.5441543421309533, + "learning_rate": 0.003, + "loss": 4.3133, + "step": 1632 + }, + { + "epoch": 0.01633, + "grad_norm": 0.5546151460088474, + "learning_rate": 0.003, + "loss": 4.2852, + "step": 1633 + }, + { + "epoch": 0.01634, + "grad_norm": 0.5283961307973114, + "learning_rate": 0.003, + "loss": 4.3028, + "step": 1634 + }, + { + "epoch": 0.01635, + "grad_norm": 0.4867354243481517, + "learning_rate": 0.003, + "loss": 4.2796, + "step": 1635 + }, + { + "epoch": 0.01636, + "grad_norm": 0.565294038636392, + "learning_rate": 0.003, + "loss": 4.2994, + "step": 1636 + }, + { + "epoch": 0.01637, + "grad_norm": 0.571317155010679, + "learning_rate": 0.003, + "loss": 4.2844, + "step": 1637 + }, + { + "epoch": 0.01638, + "grad_norm": 0.5573027415974858, + "learning_rate": 0.003, + "loss": 4.2921, + "step": 1638 + }, + { + "epoch": 0.01639, + "grad_norm": 0.6223176506662222, + "learning_rate": 0.003, + "loss": 4.3165, + "step": 1639 + }, + { + "epoch": 0.0164, + "grad_norm": 0.692259125142237, + "learning_rate": 0.003, + "loss": 4.3033, + "step": 1640 + }, + { + "epoch": 0.01641, + "grad_norm": 0.5775190344518869, + "learning_rate": 0.003, + "loss": 4.2697, + "step": 1641 + }, + { + "epoch": 0.01642, + "grad_norm": 0.5727565418624034, + "learning_rate": 0.003, + "loss": 4.316, + "step": 1642 + }, + { + "epoch": 0.01643, + "grad_norm": 0.49414286165639626, + "learning_rate": 0.003, + "loss": 4.2959, + "step": 1643 + }, + { + "epoch": 0.01644, + "grad_norm": 0.4716768356076658, + "learning_rate": 0.003, + "loss": 4.315, + "step": 1644 + }, + { + "epoch": 0.01645, + "grad_norm": 0.4203296262888758, + "learning_rate": 0.003, + "loss": 4.2571, + "step": 1645 + }, + { + "epoch": 0.01646, + "grad_norm": 0.42896262234184174, + "learning_rate": 0.003, + "loss": 4.2952, + "step": 1646 + }, + { + "epoch": 0.01647, + "grad_norm": 0.4170543756256854, + "learning_rate": 0.003, + "loss": 4.2886, + "step": 1647 + }, + { + "epoch": 0.01648, + "grad_norm": 0.42426821016871286, + "learning_rate": 0.003, + "loss": 4.2659, + "step": 1648 + }, + { + "epoch": 0.01649, + "grad_norm": 0.4471925912241752, + "learning_rate": 0.003, + "loss": 4.3027, + "step": 1649 + }, + { + "epoch": 0.0165, + "grad_norm": 0.512725534748134, + "learning_rate": 0.003, + "loss": 4.2806, + "step": 1650 + }, + { + "epoch": 0.01651, + "grad_norm": 0.7005737333281123, + "learning_rate": 0.003, + "loss": 4.2999, + "step": 1651 + }, + { + "epoch": 0.01652, + "grad_norm": 0.7594561646979391, + "learning_rate": 0.003, + "loss": 4.2643, + "step": 1652 + }, + { + "epoch": 0.01653, + "grad_norm": 0.5870095922616082, + "learning_rate": 0.003, + "loss": 4.2994, + "step": 1653 + }, + { + "epoch": 0.01654, + "grad_norm": 0.6476208006318457, + "learning_rate": 0.003, + "loss": 4.289, + "step": 1654 + }, + { + "epoch": 0.01655, + "grad_norm": 0.6928825876927465, + "learning_rate": 0.003, + "loss": 4.3102, + "step": 1655 + }, + { + "epoch": 0.01656, + "grad_norm": 0.6117569915496687, + "learning_rate": 0.003, + "loss": 4.2725, + "step": 1656 + }, + { + "epoch": 0.01657, + "grad_norm": 0.614527863212692, + "learning_rate": 0.003, + "loss": 4.2915, + "step": 1657 + }, + { + "epoch": 0.01658, + "grad_norm": 0.5818340523746026, + "learning_rate": 0.003, + "loss": 4.2809, + "step": 1658 + }, + { + "epoch": 0.01659, + "grad_norm": 0.5122965623311254, + "learning_rate": 0.003, + "loss": 4.2829, + "step": 1659 + }, + { + "epoch": 0.0166, + "grad_norm": 0.41721830635550766, + "learning_rate": 0.003, + "loss": 4.2624, + "step": 1660 + }, + { + "epoch": 0.01661, + "grad_norm": 0.3974257488587888, + "learning_rate": 0.003, + "loss": 4.2659, + "step": 1661 + }, + { + "epoch": 0.01662, + "grad_norm": 0.4134218787914048, + "learning_rate": 0.003, + "loss": 4.2699, + "step": 1662 + }, + { + "epoch": 0.01663, + "grad_norm": 0.4716687664436207, + "learning_rate": 0.003, + "loss": 4.2937, + "step": 1663 + }, + { + "epoch": 0.01664, + "grad_norm": 0.5068611880377488, + "learning_rate": 0.003, + "loss": 4.31, + "step": 1664 + }, + { + "epoch": 0.01665, + "grad_norm": 0.49884944088963573, + "learning_rate": 0.003, + "loss": 4.2734, + "step": 1665 + }, + { + "epoch": 0.01666, + "grad_norm": 0.5134497113162678, + "learning_rate": 0.003, + "loss": 4.277, + "step": 1666 + }, + { + "epoch": 0.01667, + "grad_norm": 0.4951307012977702, + "learning_rate": 0.003, + "loss": 4.2916, + "step": 1667 + }, + { + "epoch": 0.01668, + "grad_norm": 0.45857349182650015, + "learning_rate": 0.003, + "loss": 4.293, + "step": 1668 + }, + { + "epoch": 0.01669, + "grad_norm": 0.49049575707127974, + "learning_rate": 0.003, + "loss": 4.2801, + "step": 1669 + }, + { + "epoch": 0.0167, + "grad_norm": 0.4594967732301719, + "learning_rate": 0.003, + "loss": 4.2692, + "step": 1670 + }, + { + "epoch": 0.01671, + "grad_norm": 0.5516090968921173, + "learning_rate": 0.003, + "loss": 4.2597, + "step": 1671 + }, + { + "epoch": 0.01672, + "grad_norm": 0.6238914771966855, + "learning_rate": 0.003, + "loss": 4.2771, + "step": 1672 + }, + { + "epoch": 0.01673, + "grad_norm": 0.6444039300913702, + "learning_rate": 0.003, + "loss": 4.2759, + "step": 1673 + }, + { + "epoch": 0.01674, + "grad_norm": 0.6140826000846973, + "learning_rate": 0.003, + "loss": 4.2936, + "step": 1674 + }, + { + "epoch": 0.01675, + "grad_norm": 0.5952627086225425, + "learning_rate": 0.003, + "loss": 4.2797, + "step": 1675 + }, + { + "epoch": 0.01676, + "grad_norm": 0.7397767991708332, + "learning_rate": 0.003, + "loss": 4.295, + "step": 1676 + }, + { + "epoch": 0.01677, + "grad_norm": 0.6917176398613138, + "learning_rate": 0.003, + "loss": 4.287, + "step": 1677 + }, + { + "epoch": 0.01678, + "grad_norm": 0.6139182789896808, + "learning_rate": 0.003, + "loss": 4.2961, + "step": 1678 + }, + { + "epoch": 0.01679, + "grad_norm": 0.5929821413447228, + "learning_rate": 0.003, + "loss": 4.28, + "step": 1679 + }, + { + "epoch": 0.0168, + "grad_norm": 0.5995626344940229, + "learning_rate": 0.003, + "loss": 4.2962, + "step": 1680 + }, + { + "epoch": 0.01681, + "grad_norm": 0.5718502872460155, + "learning_rate": 0.003, + "loss": 4.2946, + "step": 1681 + }, + { + "epoch": 0.01682, + "grad_norm": 0.5792048131668902, + "learning_rate": 0.003, + "loss": 4.2929, + "step": 1682 + }, + { + "epoch": 0.01683, + "grad_norm": 0.5439228911454255, + "learning_rate": 0.003, + "loss": 4.3021, + "step": 1683 + }, + { + "epoch": 0.01684, + "grad_norm": 0.5499788418796372, + "learning_rate": 0.003, + "loss": 4.2762, + "step": 1684 + }, + { + "epoch": 0.01685, + "grad_norm": 0.5923021072734571, + "learning_rate": 0.003, + "loss": 4.2948, + "step": 1685 + }, + { + "epoch": 0.01686, + "grad_norm": 0.5945832006013596, + "learning_rate": 0.003, + "loss": 4.2875, + "step": 1686 + }, + { + "epoch": 0.01687, + "grad_norm": 0.5137142109366883, + "learning_rate": 0.003, + "loss": 4.2801, + "step": 1687 + }, + { + "epoch": 0.01688, + "grad_norm": 0.5492618386501558, + "learning_rate": 0.003, + "loss": 4.2695, + "step": 1688 + }, + { + "epoch": 0.01689, + "grad_norm": 0.5905725806017027, + "learning_rate": 0.003, + "loss": 4.2834, + "step": 1689 + }, + { + "epoch": 0.0169, + "grad_norm": 0.6017735937539882, + "learning_rate": 0.003, + "loss": 4.2739, + "step": 1690 + }, + { + "epoch": 0.01691, + "grad_norm": 0.5609944056824007, + "learning_rate": 0.003, + "loss": 4.2833, + "step": 1691 + }, + { + "epoch": 0.01692, + "grad_norm": 0.640275407175671, + "learning_rate": 0.003, + "loss": 4.2898, + "step": 1692 + }, + { + "epoch": 0.01693, + "grad_norm": 0.6644174640253917, + "learning_rate": 0.003, + "loss": 4.3205, + "step": 1693 + }, + { + "epoch": 0.01694, + "grad_norm": 0.6415163103575349, + "learning_rate": 0.003, + "loss": 4.2685, + "step": 1694 + }, + { + "epoch": 0.01695, + "grad_norm": 0.5843260495529953, + "learning_rate": 0.003, + "loss": 4.2981, + "step": 1695 + }, + { + "epoch": 0.01696, + "grad_norm": 0.6242095105916499, + "learning_rate": 0.003, + "loss": 4.2919, + "step": 1696 + }, + { + "epoch": 0.01697, + "grad_norm": 0.6266335557548692, + "learning_rate": 0.003, + "loss": 4.2683, + "step": 1697 + }, + { + "epoch": 0.01698, + "grad_norm": 0.5465027100918087, + "learning_rate": 0.003, + "loss": 4.2606, + "step": 1698 + }, + { + "epoch": 0.01699, + "grad_norm": 0.4757955199873524, + "learning_rate": 0.003, + "loss": 4.2671, + "step": 1699 + }, + { + "epoch": 0.017, + "grad_norm": 0.5193155427226998, + "learning_rate": 0.003, + "loss": 4.2877, + "step": 1700 + }, + { + "epoch": 0.01701, + "grad_norm": 0.5016711628234212, + "learning_rate": 0.003, + "loss": 4.2685, + "step": 1701 + }, + { + "epoch": 0.01702, + "grad_norm": 0.5283542213739725, + "learning_rate": 0.003, + "loss": 4.2934, + "step": 1702 + }, + { + "epoch": 0.01703, + "grad_norm": 0.5693915553392532, + "learning_rate": 0.003, + "loss": 4.2816, + "step": 1703 + }, + { + "epoch": 0.01704, + "grad_norm": 0.5510842689706329, + "learning_rate": 0.003, + "loss": 4.2655, + "step": 1704 + }, + { + "epoch": 0.01705, + "grad_norm": 0.5433284015620915, + "learning_rate": 0.003, + "loss": 4.2713, + "step": 1705 + }, + { + "epoch": 0.01706, + "grad_norm": 0.5361651744968902, + "learning_rate": 0.003, + "loss": 4.2666, + "step": 1706 + }, + { + "epoch": 0.01707, + "grad_norm": 0.49807673030468796, + "learning_rate": 0.003, + "loss": 4.2867, + "step": 1707 + }, + { + "epoch": 0.01708, + "grad_norm": 0.47060326864649304, + "learning_rate": 0.003, + "loss": 4.2615, + "step": 1708 + }, + { + "epoch": 0.01709, + "grad_norm": 0.5858233174308028, + "learning_rate": 0.003, + "loss": 4.2895, + "step": 1709 + }, + { + "epoch": 0.0171, + "grad_norm": 0.6958457968419427, + "learning_rate": 0.003, + "loss": 4.2423, + "step": 1710 + }, + { + "epoch": 0.01711, + "grad_norm": 0.6480228055861614, + "learning_rate": 0.003, + "loss": 4.2726, + "step": 1711 + }, + { + "epoch": 0.01712, + "grad_norm": 0.5133833526312795, + "learning_rate": 0.003, + "loss": 4.2785, + "step": 1712 + }, + { + "epoch": 0.01713, + "grad_norm": 0.6473419774088255, + "learning_rate": 0.003, + "loss": 4.2578, + "step": 1713 + }, + { + "epoch": 0.01714, + "grad_norm": 0.6885593409140894, + "learning_rate": 0.003, + "loss": 4.3021, + "step": 1714 + }, + { + "epoch": 0.01715, + "grad_norm": 0.7110176805262337, + "learning_rate": 0.003, + "loss": 4.3086, + "step": 1715 + }, + { + "epoch": 0.01716, + "grad_norm": 0.6383663373581888, + "learning_rate": 0.003, + "loss": 4.284, + "step": 1716 + }, + { + "epoch": 0.01717, + "grad_norm": 0.5609012358245258, + "learning_rate": 0.003, + "loss": 4.3151, + "step": 1717 + }, + { + "epoch": 0.01718, + "grad_norm": 0.4893869305325425, + "learning_rate": 0.003, + "loss": 4.279, + "step": 1718 + }, + { + "epoch": 0.01719, + "grad_norm": 0.4683591562635787, + "learning_rate": 0.003, + "loss": 4.3002, + "step": 1719 + }, + { + "epoch": 0.0172, + "grad_norm": 0.43007495593928735, + "learning_rate": 0.003, + "loss": 4.2576, + "step": 1720 + }, + { + "epoch": 0.01721, + "grad_norm": 0.4793855122225898, + "learning_rate": 0.003, + "loss": 4.2731, + "step": 1721 + }, + { + "epoch": 0.01722, + "grad_norm": 0.5257529092437175, + "learning_rate": 0.003, + "loss": 4.2675, + "step": 1722 + }, + { + "epoch": 0.01723, + "grad_norm": 0.6135435997236554, + "learning_rate": 0.003, + "loss": 4.2811, + "step": 1723 + }, + { + "epoch": 0.01724, + "grad_norm": 0.6429236565141022, + "learning_rate": 0.003, + "loss": 4.2853, + "step": 1724 + }, + { + "epoch": 0.01725, + "grad_norm": 0.6238007075890811, + "learning_rate": 0.003, + "loss": 4.27, + "step": 1725 + }, + { + "epoch": 0.01726, + "grad_norm": 0.5743130238790891, + "learning_rate": 0.003, + "loss": 4.2614, + "step": 1726 + }, + { + "epoch": 0.01727, + "grad_norm": 0.5512027746636241, + "learning_rate": 0.003, + "loss": 4.266, + "step": 1727 + }, + { + "epoch": 0.01728, + "grad_norm": 0.5565544152920284, + "learning_rate": 0.003, + "loss": 4.2687, + "step": 1728 + }, + { + "epoch": 0.01729, + "grad_norm": 0.6053220105417224, + "learning_rate": 0.003, + "loss": 4.2796, + "step": 1729 + }, + { + "epoch": 0.0173, + "grad_norm": 0.7004864603830978, + "learning_rate": 0.003, + "loss": 4.256, + "step": 1730 + }, + { + "epoch": 0.01731, + "grad_norm": 0.6677625209808086, + "learning_rate": 0.003, + "loss": 4.2797, + "step": 1731 + }, + { + "epoch": 0.01732, + "grad_norm": 0.6458934000853458, + "learning_rate": 0.003, + "loss": 4.2877, + "step": 1732 + }, + { + "epoch": 0.01733, + "grad_norm": 0.7455706085018527, + "learning_rate": 0.003, + "loss": 4.2732, + "step": 1733 + }, + { + "epoch": 0.01734, + "grad_norm": 0.6265592537450486, + "learning_rate": 0.003, + "loss": 4.2823, + "step": 1734 + }, + { + "epoch": 0.01735, + "grad_norm": 0.5392857728589878, + "learning_rate": 0.003, + "loss": 4.2489, + "step": 1735 + }, + { + "epoch": 0.01736, + "grad_norm": 0.5707588496610313, + "learning_rate": 0.003, + "loss": 4.2503, + "step": 1736 + }, + { + "epoch": 0.01737, + "grad_norm": 0.5560424700678082, + "learning_rate": 0.003, + "loss": 4.2812, + "step": 1737 + }, + { + "epoch": 0.01738, + "grad_norm": 0.5730683002180033, + "learning_rate": 0.003, + "loss": 4.2686, + "step": 1738 + }, + { + "epoch": 0.01739, + "grad_norm": 0.6224581195400443, + "learning_rate": 0.003, + "loss": 4.2874, + "step": 1739 + }, + { + "epoch": 0.0174, + "grad_norm": 0.6572820925374313, + "learning_rate": 0.003, + "loss": 4.2884, + "step": 1740 + }, + { + "epoch": 0.01741, + "grad_norm": 0.7158790913353127, + "learning_rate": 0.003, + "loss": 4.3015, + "step": 1741 + }, + { + "epoch": 0.01742, + "grad_norm": 0.7666622653552904, + "learning_rate": 0.003, + "loss": 4.2911, + "step": 1742 + }, + { + "epoch": 0.01743, + "grad_norm": 0.7426449847041513, + "learning_rate": 0.003, + "loss": 4.3102, + "step": 1743 + }, + { + "epoch": 0.01744, + "grad_norm": 0.7275834660740842, + "learning_rate": 0.003, + "loss": 4.2916, + "step": 1744 + }, + { + "epoch": 0.01745, + "grad_norm": 0.6002910482818848, + "learning_rate": 0.003, + "loss": 4.2539, + "step": 1745 + }, + { + "epoch": 0.01746, + "grad_norm": 0.5594118226409661, + "learning_rate": 0.003, + "loss": 4.2741, + "step": 1746 + }, + { + "epoch": 0.01747, + "grad_norm": 0.5290431452891758, + "learning_rate": 0.003, + "loss": 4.2708, + "step": 1747 + }, + { + "epoch": 0.01748, + "grad_norm": 0.5055142153030036, + "learning_rate": 0.003, + "loss": 4.263, + "step": 1748 + }, + { + "epoch": 0.01749, + "grad_norm": 0.5966872692997083, + "learning_rate": 0.003, + "loss": 4.2613, + "step": 1749 + }, + { + "epoch": 0.0175, + "grad_norm": 0.5900848154260279, + "learning_rate": 0.003, + "loss": 4.2801, + "step": 1750 + }, + { + "epoch": 0.01751, + "grad_norm": 0.6245345037465497, + "learning_rate": 0.003, + "loss": 4.2737, + "step": 1751 + }, + { + "epoch": 0.01752, + "grad_norm": 0.5951652358284407, + "learning_rate": 0.003, + "loss": 4.3127, + "step": 1752 + }, + { + "epoch": 0.01753, + "grad_norm": 0.5691433797032257, + "learning_rate": 0.003, + "loss": 4.2613, + "step": 1753 + }, + { + "epoch": 0.01754, + "grad_norm": 0.5243977651759597, + "learning_rate": 0.003, + "loss": 4.2904, + "step": 1754 + }, + { + "epoch": 0.01755, + "grad_norm": 0.5047009634705759, + "learning_rate": 0.003, + "loss": 4.23, + "step": 1755 + }, + { + "epoch": 0.01756, + "grad_norm": 0.5542549196861234, + "learning_rate": 0.003, + "loss": 4.3093, + "step": 1756 + }, + { + "epoch": 0.01757, + "grad_norm": 0.5493798919225148, + "learning_rate": 0.003, + "loss": 4.2887, + "step": 1757 + }, + { + "epoch": 0.01758, + "grad_norm": 0.5731223181742938, + "learning_rate": 0.003, + "loss": 4.2949, + "step": 1758 + }, + { + "epoch": 0.01759, + "grad_norm": 0.5640596577781694, + "learning_rate": 0.003, + "loss": 4.2619, + "step": 1759 + }, + { + "epoch": 0.0176, + "grad_norm": 0.49712782984125864, + "learning_rate": 0.003, + "loss": 4.2816, + "step": 1760 + }, + { + "epoch": 0.01761, + "grad_norm": 0.49486559076430775, + "learning_rate": 0.003, + "loss": 4.2648, + "step": 1761 + }, + { + "epoch": 0.01762, + "grad_norm": 0.5110004857223106, + "learning_rate": 0.003, + "loss": 4.2854, + "step": 1762 + }, + { + "epoch": 0.01763, + "grad_norm": 0.5633857856922079, + "learning_rate": 0.003, + "loss": 4.2792, + "step": 1763 + }, + { + "epoch": 0.01764, + "grad_norm": 0.7662844615211925, + "learning_rate": 0.003, + "loss": 4.255, + "step": 1764 + }, + { + "epoch": 0.01765, + "grad_norm": 0.8231706617421852, + "learning_rate": 0.003, + "loss": 4.2957, + "step": 1765 + }, + { + "epoch": 0.01766, + "grad_norm": 0.6112884625612803, + "learning_rate": 0.003, + "loss": 4.2946, + "step": 1766 + }, + { + "epoch": 0.01767, + "grad_norm": 0.5304115562781407, + "learning_rate": 0.003, + "loss": 4.2562, + "step": 1767 + }, + { + "epoch": 0.01768, + "grad_norm": 0.5693392116057188, + "learning_rate": 0.003, + "loss": 4.2663, + "step": 1768 + }, + { + "epoch": 0.01769, + "grad_norm": 0.5007419067309367, + "learning_rate": 0.003, + "loss": 4.2627, + "step": 1769 + }, + { + "epoch": 0.0177, + "grad_norm": 0.5337006106644646, + "learning_rate": 0.003, + "loss": 4.2481, + "step": 1770 + }, + { + "epoch": 0.01771, + "grad_norm": 0.4993647653872291, + "learning_rate": 0.003, + "loss": 4.278, + "step": 1771 + }, + { + "epoch": 0.01772, + "grad_norm": 0.49378669154496685, + "learning_rate": 0.003, + "loss": 4.2492, + "step": 1772 + }, + { + "epoch": 0.01773, + "grad_norm": 0.49668963256381116, + "learning_rate": 0.003, + "loss": 4.2435, + "step": 1773 + }, + { + "epoch": 0.01774, + "grad_norm": 0.4890453055402567, + "learning_rate": 0.003, + "loss": 4.2647, + "step": 1774 + }, + { + "epoch": 0.01775, + "grad_norm": 0.4981133550102386, + "learning_rate": 0.003, + "loss": 4.2509, + "step": 1775 + }, + { + "epoch": 0.01776, + "grad_norm": 0.5331679722135182, + "learning_rate": 0.003, + "loss": 4.2445, + "step": 1776 + }, + { + "epoch": 0.01777, + "grad_norm": 0.5614293963448983, + "learning_rate": 0.003, + "loss": 4.269, + "step": 1777 + }, + { + "epoch": 0.01778, + "grad_norm": 0.5337530525849652, + "learning_rate": 0.003, + "loss": 4.2591, + "step": 1778 + }, + { + "epoch": 0.01779, + "grad_norm": 0.5219382960786771, + "learning_rate": 0.003, + "loss": 4.277, + "step": 1779 + }, + { + "epoch": 0.0178, + "grad_norm": 0.4801993403704907, + "learning_rate": 0.003, + "loss": 4.2462, + "step": 1780 + }, + { + "epoch": 0.01781, + "grad_norm": 0.5353966732851616, + "learning_rate": 0.003, + "loss": 4.2633, + "step": 1781 + }, + { + "epoch": 0.01782, + "grad_norm": 0.5707424190875742, + "learning_rate": 0.003, + "loss": 4.2884, + "step": 1782 + }, + { + "epoch": 0.01783, + "grad_norm": 0.6083204008367944, + "learning_rate": 0.003, + "loss": 4.269, + "step": 1783 + }, + { + "epoch": 0.01784, + "grad_norm": 0.5414123353142472, + "learning_rate": 0.003, + "loss": 4.2666, + "step": 1784 + }, + { + "epoch": 0.01785, + "grad_norm": 0.6099552330414602, + "learning_rate": 0.003, + "loss": 4.2322, + "step": 1785 + }, + { + "epoch": 0.01786, + "grad_norm": 0.6938449080529763, + "learning_rate": 0.003, + "loss": 4.2624, + "step": 1786 + }, + { + "epoch": 0.01787, + "grad_norm": 0.7097606505143458, + "learning_rate": 0.003, + "loss": 4.2787, + "step": 1787 + }, + { + "epoch": 0.01788, + "grad_norm": 0.7987231782657276, + "learning_rate": 0.003, + "loss": 4.29, + "step": 1788 + }, + { + "epoch": 0.01789, + "grad_norm": 0.9170384713763422, + "learning_rate": 0.003, + "loss": 4.2752, + "step": 1789 + }, + { + "epoch": 0.0179, + "grad_norm": 0.8912339815181158, + "learning_rate": 0.003, + "loss": 4.2973, + "step": 1790 + }, + { + "epoch": 0.01791, + "grad_norm": 0.7681052193269913, + "learning_rate": 0.003, + "loss": 4.3056, + "step": 1791 + }, + { + "epoch": 0.01792, + "grad_norm": 0.6429477629622415, + "learning_rate": 0.003, + "loss": 4.3028, + "step": 1792 + }, + { + "epoch": 0.01793, + "grad_norm": 0.7186534811578004, + "learning_rate": 0.003, + "loss": 4.2729, + "step": 1793 + }, + { + "epoch": 0.01794, + "grad_norm": 0.5765860920967305, + "learning_rate": 0.003, + "loss": 4.2799, + "step": 1794 + }, + { + "epoch": 0.01795, + "grad_norm": 0.5000657697525336, + "learning_rate": 0.003, + "loss": 4.2762, + "step": 1795 + }, + { + "epoch": 0.01796, + "grad_norm": 0.5320047446020336, + "learning_rate": 0.003, + "loss": 4.2904, + "step": 1796 + }, + { + "epoch": 0.01797, + "grad_norm": 0.4504365845865774, + "learning_rate": 0.003, + "loss": 4.2509, + "step": 1797 + }, + { + "epoch": 0.01798, + "grad_norm": 0.40462213723419516, + "learning_rate": 0.003, + "loss": 4.2636, + "step": 1798 + }, + { + "epoch": 0.01799, + "grad_norm": 0.35282082316813834, + "learning_rate": 0.003, + "loss": 4.2763, + "step": 1799 + }, + { + "epoch": 0.018, + "grad_norm": 0.3411687376041599, + "learning_rate": 0.003, + "loss": 4.2632, + "step": 1800 + }, + { + "epoch": 0.01801, + "grad_norm": 0.3497200911452386, + "learning_rate": 0.003, + "loss": 4.2568, + "step": 1801 + }, + { + "epoch": 0.01802, + "grad_norm": 0.35422809832207447, + "learning_rate": 0.003, + "loss": 4.2731, + "step": 1802 + }, + { + "epoch": 0.01803, + "grad_norm": 0.3279859543333952, + "learning_rate": 0.003, + "loss": 4.277, + "step": 1803 + }, + { + "epoch": 0.01804, + "grad_norm": 0.37278065596161997, + "learning_rate": 0.003, + "loss": 4.2479, + "step": 1804 + }, + { + "epoch": 0.01805, + "grad_norm": 0.4167024120969314, + "learning_rate": 0.003, + "loss": 4.268, + "step": 1805 + }, + { + "epoch": 0.01806, + "grad_norm": 0.45338026173808493, + "learning_rate": 0.003, + "loss": 4.228, + "step": 1806 + }, + { + "epoch": 0.01807, + "grad_norm": 0.4492923467683076, + "learning_rate": 0.003, + "loss": 4.2649, + "step": 1807 + }, + { + "epoch": 0.01808, + "grad_norm": 0.4519532920743694, + "learning_rate": 0.003, + "loss": 4.2552, + "step": 1808 + }, + { + "epoch": 0.01809, + "grad_norm": 0.4794685836261005, + "learning_rate": 0.003, + "loss": 4.2751, + "step": 1809 + }, + { + "epoch": 0.0181, + "grad_norm": 0.6301644557370442, + "learning_rate": 0.003, + "loss": 4.275, + "step": 1810 + }, + { + "epoch": 0.01811, + "grad_norm": 0.8128009946323582, + "learning_rate": 0.003, + "loss": 4.2718, + "step": 1811 + }, + { + "epoch": 0.01812, + "grad_norm": 0.8427115921852621, + "learning_rate": 0.003, + "loss": 4.2615, + "step": 1812 + }, + { + "epoch": 0.01813, + "grad_norm": 0.7199083817416421, + "learning_rate": 0.003, + "loss": 4.3052, + "step": 1813 + }, + { + "epoch": 0.01814, + "grad_norm": 0.7440856146842654, + "learning_rate": 0.003, + "loss": 4.2773, + "step": 1814 + }, + { + "epoch": 0.01815, + "grad_norm": 0.6604668614903264, + "learning_rate": 0.003, + "loss": 4.2603, + "step": 1815 + }, + { + "epoch": 0.01816, + "grad_norm": 0.7213183194060501, + "learning_rate": 0.003, + "loss": 4.289, + "step": 1816 + }, + { + "epoch": 0.01817, + "grad_norm": 0.707160301614863, + "learning_rate": 0.003, + "loss": 4.2802, + "step": 1817 + }, + { + "epoch": 0.01818, + "grad_norm": 0.636608110327377, + "learning_rate": 0.003, + "loss": 4.2785, + "step": 1818 + }, + { + "epoch": 0.01819, + "grad_norm": 0.5804906977024753, + "learning_rate": 0.003, + "loss": 4.2581, + "step": 1819 + }, + { + "epoch": 0.0182, + "grad_norm": 0.5372919614803818, + "learning_rate": 0.003, + "loss": 4.2768, + "step": 1820 + }, + { + "epoch": 0.01821, + "grad_norm": 0.5873367244339422, + "learning_rate": 0.003, + "loss": 4.2652, + "step": 1821 + }, + { + "epoch": 0.01822, + "grad_norm": 0.6367155622372229, + "learning_rate": 0.003, + "loss": 4.278, + "step": 1822 + }, + { + "epoch": 0.01823, + "grad_norm": 0.7098801106756836, + "learning_rate": 0.003, + "loss": 4.2989, + "step": 1823 + }, + { + "epoch": 0.01824, + "grad_norm": 0.6249291972074501, + "learning_rate": 0.003, + "loss": 4.2755, + "step": 1824 + }, + { + "epoch": 0.01825, + "grad_norm": 0.5242106659607212, + "learning_rate": 0.003, + "loss": 4.2762, + "step": 1825 + }, + { + "epoch": 0.01826, + "grad_norm": 0.644099441571583, + "learning_rate": 0.003, + "loss": 4.2915, + "step": 1826 + }, + { + "epoch": 0.01827, + "grad_norm": 0.6224752218569206, + "learning_rate": 0.003, + "loss": 4.2591, + "step": 1827 + }, + { + "epoch": 0.01828, + "grad_norm": 0.543186641981227, + "learning_rate": 0.003, + "loss": 4.2695, + "step": 1828 + }, + { + "epoch": 0.01829, + "grad_norm": 0.6845506683310987, + "learning_rate": 0.003, + "loss": 4.2898, + "step": 1829 + }, + { + "epoch": 0.0183, + "grad_norm": 0.879601600440537, + "learning_rate": 0.003, + "loss": 4.2972, + "step": 1830 + }, + { + "epoch": 0.01831, + "grad_norm": 0.9844682114288815, + "learning_rate": 0.003, + "loss": 4.2992, + "step": 1831 + }, + { + "epoch": 0.01832, + "grad_norm": 0.8821724273705098, + "learning_rate": 0.003, + "loss": 4.2978, + "step": 1832 + }, + { + "epoch": 0.01833, + "grad_norm": 0.7330000924703792, + "learning_rate": 0.003, + "loss": 4.3267, + "step": 1833 + }, + { + "epoch": 0.01834, + "grad_norm": 0.68997205114043, + "learning_rate": 0.003, + "loss": 4.2918, + "step": 1834 + }, + { + "epoch": 0.01835, + "grad_norm": 0.7006645684897048, + "learning_rate": 0.003, + "loss": 4.2921, + "step": 1835 + }, + { + "epoch": 0.01836, + "grad_norm": 0.7370828132611139, + "learning_rate": 0.003, + "loss": 4.2868, + "step": 1836 + }, + { + "epoch": 0.01837, + "grad_norm": 0.8137814608022381, + "learning_rate": 0.003, + "loss": 4.2934, + "step": 1837 + }, + { + "epoch": 0.01838, + "grad_norm": 0.6630708544495384, + "learning_rate": 0.003, + "loss": 4.2935, + "step": 1838 + }, + { + "epoch": 0.01839, + "grad_norm": 0.6770506893324072, + "learning_rate": 0.003, + "loss": 4.3163, + "step": 1839 + }, + { + "epoch": 0.0184, + "grad_norm": 0.772243829580562, + "learning_rate": 0.003, + "loss": 4.315, + "step": 1840 + }, + { + "epoch": 0.01841, + "grad_norm": 0.7260652894274943, + "learning_rate": 0.003, + "loss": 4.2507, + "step": 1841 + }, + { + "epoch": 0.01842, + "grad_norm": 0.7455618963662838, + "learning_rate": 0.003, + "loss": 4.2889, + "step": 1842 + }, + { + "epoch": 0.01843, + "grad_norm": 0.5629263660723788, + "learning_rate": 0.003, + "loss": 4.2632, + "step": 1843 + }, + { + "epoch": 0.01844, + "grad_norm": 0.44782374021160304, + "learning_rate": 0.003, + "loss": 4.2542, + "step": 1844 + }, + { + "epoch": 0.01845, + "grad_norm": 0.44338335058672285, + "learning_rate": 0.003, + "loss": 4.2569, + "step": 1845 + }, + { + "epoch": 0.01846, + "grad_norm": 0.3533576160559802, + "learning_rate": 0.003, + "loss": 4.2775, + "step": 1846 + }, + { + "epoch": 0.01847, + "grad_norm": 0.36624369667968887, + "learning_rate": 0.003, + "loss": 4.2733, + "step": 1847 + }, + { + "epoch": 0.01848, + "grad_norm": 0.3515311739049859, + "learning_rate": 0.003, + "loss": 4.2847, + "step": 1848 + }, + { + "epoch": 0.01849, + "grad_norm": 0.31316963159896893, + "learning_rate": 0.003, + "loss": 4.2827, + "step": 1849 + }, + { + "epoch": 0.0185, + "grad_norm": 0.3416820848274596, + "learning_rate": 0.003, + "loss": 4.2703, + "step": 1850 + }, + { + "epoch": 0.01851, + "grad_norm": 0.3489689830102001, + "learning_rate": 0.003, + "loss": 4.2512, + "step": 1851 + }, + { + "epoch": 0.01852, + "grad_norm": 0.3703418463232587, + "learning_rate": 0.003, + "loss": 4.2517, + "step": 1852 + }, + { + "epoch": 0.01853, + "grad_norm": 0.41368285825954554, + "learning_rate": 0.003, + "loss": 4.2519, + "step": 1853 + }, + { + "epoch": 0.01854, + "grad_norm": 0.43320899613747116, + "learning_rate": 0.003, + "loss": 4.2834, + "step": 1854 + }, + { + "epoch": 0.01855, + "grad_norm": 0.5253237000651575, + "learning_rate": 0.003, + "loss": 4.2689, + "step": 1855 + }, + { + "epoch": 0.01856, + "grad_norm": 0.5779002753152843, + "learning_rate": 0.003, + "loss": 4.2553, + "step": 1856 + }, + { + "epoch": 0.01857, + "grad_norm": 0.593959858084494, + "learning_rate": 0.003, + "loss": 4.241, + "step": 1857 + }, + { + "epoch": 0.01858, + "grad_norm": 0.5126648296135959, + "learning_rate": 0.003, + "loss": 4.2479, + "step": 1858 + }, + { + "epoch": 0.01859, + "grad_norm": 0.45559130929525077, + "learning_rate": 0.003, + "loss": 4.2412, + "step": 1859 + }, + { + "epoch": 0.0186, + "grad_norm": 0.4806055065781757, + "learning_rate": 0.003, + "loss": 4.2254, + "step": 1860 + }, + { + "epoch": 0.01861, + "grad_norm": 0.48294706675854066, + "learning_rate": 0.003, + "loss": 4.2694, + "step": 1861 + }, + { + "epoch": 0.01862, + "grad_norm": 0.4664947919884159, + "learning_rate": 0.003, + "loss": 4.2335, + "step": 1862 + }, + { + "epoch": 0.01863, + "grad_norm": 0.4819198640587951, + "learning_rate": 0.003, + "loss": 4.2428, + "step": 1863 + }, + { + "epoch": 0.01864, + "grad_norm": 0.5946115673610074, + "learning_rate": 0.003, + "loss": 4.2797, + "step": 1864 + }, + { + "epoch": 0.01865, + "grad_norm": 0.7290398384671329, + "learning_rate": 0.003, + "loss": 4.2637, + "step": 1865 + }, + { + "epoch": 0.01866, + "grad_norm": 0.8160446708452547, + "learning_rate": 0.003, + "loss": 4.2688, + "step": 1866 + }, + { + "epoch": 0.01867, + "grad_norm": 0.7596737913536388, + "learning_rate": 0.003, + "loss": 4.2479, + "step": 1867 + }, + { + "epoch": 0.01868, + "grad_norm": 0.7512897467429733, + "learning_rate": 0.003, + "loss": 4.265, + "step": 1868 + }, + { + "epoch": 0.01869, + "grad_norm": 0.899865336798572, + "learning_rate": 0.003, + "loss": 4.2896, + "step": 1869 + }, + { + "epoch": 0.0187, + "grad_norm": 0.8087963711485776, + "learning_rate": 0.003, + "loss": 4.2851, + "step": 1870 + }, + { + "epoch": 0.01871, + "grad_norm": 0.6481673959520611, + "learning_rate": 0.003, + "loss": 4.2524, + "step": 1871 + }, + { + "epoch": 0.01872, + "grad_norm": 0.5923778344000376, + "learning_rate": 0.003, + "loss": 4.2969, + "step": 1872 + }, + { + "epoch": 0.01873, + "grad_norm": 0.5580453248479986, + "learning_rate": 0.003, + "loss": 4.2558, + "step": 1873 + }, + { + "epoch": 0.01874, + "grad_norm": 0.6129883764486191, + "learning_rate": 0.003, + "loss": 4.2619, + "step": 1874 + }, + { + "epoch": 0.01875, + "grad_norm": 0.5660814639175555, + "learning_rate": 0.003, + "loss": 4.2767, + "step": 1875 + }, + { + "epoch": 0.01876, + "grad_norm": 0.5484550288366968, + "learning_rate": 0.003, + "loss": 4.2804, + "step": 1876 + }, + { + "epoch": 0.01877, + "grad_norm": 0.5413620281670742, + "learning_rate": 0.003, + "loss": 4.2852, + "step": 1877 + }, + { + "epoch": 0.01878, + "grad_norm": 0.5151059596185084, + "learning_rate": 0.003, + "loss": 4.2658, + "step": 1878 + }, + { + "epoch": 0.01879, + "grad_norm": 0.5108820927917921, + "learning_rate": 0.003, + "loss": 4.2428, + "step": 1879 + }, + { + "epoch": 0.0188, + "grad_norm": 0.480687658081291, + "learning_rate": 0.003, + "loss": 4.253, + "step": 1880 + }, + { + "epoch": 0.01881, + "grad_norm": 0.4006710963490709, + "learning_rate": 0.003, + "loss": 4.2442, + "step": 1881 + }, + { + "epoch": 0.01882, + "grad_norm": 0.4041555926045392, + "learning_rate": 0.003, + "loss": 4.2713, + "step": 1882 + }, + { + "epoch": 0.01883, + "grad_norm": 0.35889370988759256, + "learning_rate": 0.003, + "loss": 4.2426, + "step": 1883 + }, + { + "epoch": 0.01884, + "grad_norm": 0.3548857794793813, + "learning_rate": 0.003, + "loss": 4.2417, + "step": 1884 + }, + { + "epoch": 0.01885, + "grad_norm": 0.35029061193490524, + "learning_rate": 0.003, + "loss": 4.2179, + "step": 1885 + }, + { + "epoch": 0.01886, + "grad_norm": 0.44501447232013214, + "learning_rate": 0.003, + "loss": 4.2578, + "step": 1886 + }, + { + "epoch": 0.01887, + "grad_norm": 0.5229549741250542, + "learning_rate": 0.003, + "loss": 4.2541, + "step": 1887 + }, + { + "epoch": 0.01888, + "grad_norm": 0.6745619001945896, + "learning_rate": 0.003, + "loss": 4.2454, + "step": 1888 + }, + { + "epoch": 0.01889, + "grad_norm": 0.6923119713487843, + "learning_rate": 0.003, + "loss": 4.265, + "step": 1889 + }, + { + "epoch": 0.0189, + "grad_norm": 0.58679702829643, + "learning_rate": 0.003, + "loss": 4.2409, + "step": 1890 + }, + { + "epoch": 0.01891, + "grad_norm": 0.646800864954939, + "learning_rate": 0.003, + "loss": 4.2919, + "step": 1891 + }, + { + "epoch": 0.01892, + "grad_norm": 0.6502623001706804, + "learning_rate": 0.003, + "loss": 4.2707, + "step": 1892 + }, + { + "epoch": 0.01893, + "grad_norm": 0.5615534003702374, + "learning_rate": 0.003, + "loss": 4.2469, + "step": 1893 + }, + { + "epoch": 0.01894, + "grad_norm": 0.5991987877315249, + "learning_rate": 0.003, + "loss": 4.254, + "step": 1894 + }, + { + "epoch": 0.01895, + "grad_norm": 0.5685602484818931, + "learning_rate": 0.003, + "loss": 4.2409, + "step": 1895 + }, + { + "epoch": 0.01896, + "grad_norm": 0.5273740611308925, + "learning_rate": 0.003, + "loss": 4.2621, + "step": 1896 + }, + { + "epoch": 0.01897, + "grad_norm": 0.4634723554903588, + "learning_rate": 0.003, + "loss": 4.2587, + "step": 1897 + }, + { + "epoch": 0.01898, + "grad_norm": 0.49012578983884775, + "learning_rate": 0.003, + "loss": 4.2271, + "step": 1898 + }, + { + "epoch": 0.01899, + "grad_norm": 0.5391967127125681, + "learning_rate": 0.003, + "loss": 4.2462, + "step": 1899 + }, + { + "epoch": 0.019, + "grad_norm": 0.5314902668241436, + "learning_rate": 0.003, + "loss": 4.2384, + "step": 1900 + }, + { + "epoch": 0.01901, + "grad_norm": 0.5694638994637579, + "learning_rate": 0.003, + "loss": 4.2871, + "step": 1901 + }, + { + "epoch": 0.01902, + "grad_norm": 0.5590323874513298, + "learning_rate": 0.003, + "loss": 4.2641, + "step": 1902 + }, + { + "epoch": 0.01903, + "grad_norm": 0.5475545528889447, + "learning_rate": 0.003, + "loss": 4.2196, + "step": 1903 + }, + { + "epoch": 0.01904, + "grad_norm": 0.6218453695681301, + "learning_rate": 0.003, + "loss": 4.2791, + "step": 1904 + }, + { + "epoch": 0.01905, + "grad_norm": 0.591162548550656, + "learning_rate": 0.003, + "loss": 4.2534, + "step": 1905 + }, + { + "epoch": 0.01906, + "grad_norm": 0.5128889662513827, + "learning_rate": 0.003, + "loss": 4.2676, + "step": 1906 + }, + { + "epoch": 0.01907, + "grad_norm": 0.5047288261347648, + "learning_rate": 0.003, + "loss": 4.2599, + "step": 1907 + }, + { + "epoch": 0.01908, + "grad_norm": 0.4667283203300201, + "learning_rate": 0.003, + "loss": 4.2726, + "step": 1908 + }, + { + "epoch": 0.01909, + "grad_norm": 0.5022149031562609, + "learning_rate": 0.003, + "loss": 4.2382, + "step": 1909 + }, + { + "epoch": 0.0191, + "grad_norm": 0.5432392826551734, + "learning_rate": 0.003, + "loss": 4.2518, + "step": 1910 + }, + { + "epoch": 0.01911, + "grad_norm": 0.49523295523680083, + "learning_rate": 0.003, + "loss": 4.2241, + "step": 1911 + }, + { + "epoch": 0.01912, + "grad_norm": 0.5206775073204631, + "learning_rate": 0.003, + "loss": 4.2363, + "step": 1912 + }, + { + "epoch": 0.01913, + "grad_norm": 0.523325426364381, + "learning_rate": 0.003, + "loss": 4.2544, + "step": 1913 + }, + { + "epoch": 0.01914, + "grad_norm": 0.5843680788676122, + "learning_rate": 0.003, + "loss": 4.2743, + "step": 1914 + }, + { + "epoch": 0.01915, + "grad_norm": 0.6905336390991205, + "learning_rate": 0.003, + "loss": 4.2632, + "step": 1915 + }, + { + "epoch": 0.01916, + "grad_norm": 0.8314031188331161, + "learning_rate": 0.003, + "loss": 4.2512, + "step": 1916 + }, + { + "epoch": 0.01917, + "grad_norm": 0.9750007322129252, + "learning_rate": 0.003, + "loss": 4.2904, + "step": 1917 + }, + { + "epoch": 0.01918, + "grad_norm": 0.9797925708116749, + "learning_rate": 0.003, + "loss": 4.2842, + "step": 1918 + }, + { + "epoch": 0.01919, + "grad_norm": 0.7747220152275403, + "learning_rate": 0.003, + "loss": 4.2834, + "step": 1919 + }, + { + "epoch": 0.0192, + "grad_norm": 0.7189857079350012, + "learning_rate": 0.003, + "loss": 4.2819, + "step": 1920 + }, + { + "epoch": 0.01921, + "grad_norm": 0.7219748871978388, + "learning_rate": 0.003, + "loss": 4.2705, + "step": 1921 + }, + { + "epoch": 0.01922, + "grad_norm": 0.7146468956621638, + "learning_rate": 0.003, + "loss": 4.2862, + "step": 1922 + }, + { + "epoch": 0.01923, + "grad_norm": 0.6697224209681402, + "learning_rate": 0.003, + "loss": 4.2923, + "step": 1923 + }, + { + "epoch": 0.01924, + "grad_norm": 0.7063194554901026, + "learning_rate": 0.003, + "loss": 4.2924, + "step": 1924 + }, + { + "epoch": 0.01925, + "grad_norm": 0.6608146496442958, + "learning_rate": 0.003, + "loss": 4.2782, + "step": 1925 + }, + { + "epoch": 0.01926, + "grad_norm": 0.6058286924144092, + "learning_rate": 0.003, + "loss": 4.2871, + "step": 1926 + }, + { + "epoch": 0.01927, + "grad_norm": 0.5582247211706641, + "learning_rate": 0.003, + "loss": 4.2731, + "step": 1927 + }, + { + "epoch": 0.01928, + "grad_norm": 0.5497301592853885, + "learning_rate": 0.003, + "loss": 4.2438, + "step": 1928 + }, + { + "epoch": 0.01929, + "grad_norm": 0.5697976403038864, + "learning_rate": 0.003, + "loss": 4.2661, + "step": 1929 + }, + { + "epoch": 0.0193, + "grad_norm": 0.5669233105595727, + "learning_rate": 0.003, + "loss": 4.253, + "step": 1930 + }, + { + "epoch": 0.01931, + "grad_norm": 0.4328335869920709, + "learning_rate": 0.003, + "loss": 4.2488, + "step": 1931 + }, + { + "epoch": 0.01932, + "grad_norm": 0.37950505638182785, + "learning_rate": 0.003, + "loss": 4.2566, + "step": 1932 + }, + { + "epoch": 0.01933, + "grad_norm": 0.3504964342922692, + "learning_rate": 0.003, + "loss": 4.2408, + "step": 1933 + }, + { + "epoch": 0.01934, + "grad_norm": 0.3616114126450499, + "learning_rate": 0.003, + "loss": 4.2472, + "step": 1934 + }, + { + "epoch": 0.01935, + "grad_norm": 0.3586196056492206, + "learning_rate": 0.003, + "loss": 4.259, + "step": 1935 + }, + { + "epoch": 0.01936, + "grad_norm": 0.4036363099668179, + "learning_rate": 0.003, + "loss": 4.2404, + "step": 1936 + }, + { + "epoch": 0.01937, + "grad_norm": 0.41909297435676146, + "learning_rate": 0.003, + "loss": 4.2527, + "step": 1937 + }, + { + "epoch": 0.01938, + "grad_norm": 0.439340208584005, + "learning_rate": 0.003, + "loss": 4.2528, + "step": 1938 + }, + { + "epoch": 0.01939, + "grad_norm": 0.4942036615177103, + "learning_rate": 0.003, + "loss": 4.2748, + "step": 1939 + }, + { + "epoch": 0.0194, + "grad_norm": 0.5753579784669907, + "learning_rate": 0.003, + "loss": 4.2537, + "step": 1940 + }, + { + "epoch": 0.01941, + "grad_norm": 0.7525206839134082, + "learning_rate": 0.003, + "loss": 4.2429, + "step": 1941 + }, + { + "epoch": 0.01942, + "grad_norm": 0.8127296854745015, + "learning_rate": 0.003, + "loss": 4.2788, + "step": 1942 + }, + { + "epoch": 0.01943, + "grad_norm": 0.7505954536075328, + "learning_rate": 0.003, + "loss": 4.2474, + "step": 1943 + }, + { + "epoch": 0.01944, + "grad_norm": 0.6966879842895444, + "learning_rate": 0.003, + "loss": 4.292, + "step": 1944 + }, + { + "epoch": 0.01945, + "grad_norm": 0.5317620511293283, + "learning_rate": 0.003, + "loss": 4.2513, + "step": 1945 + }, + { + "epoch": 0.01946, + "grad_norm": 0.5396271588936495, + "learning_rate": 0.003, + "loss": 4.2483, + "step": 1946 + }, + { + "epoch": 0.01947, + "grad_norm": 0.4953395916242608, + "learning_rate": 0.003, + "loss": 4.2539, + "step": 1947 + }, + { + "epoch": 0.01948, + "grad_norm": 0.43095282528265144, + "learning_rate": 0.003, + "loss": 4.2724, + "step": 1948 + }, + { + "epoch": 0.01949, + "grad_norm": 0.4581100416309599, + "learning_rate": 0.003, + "loss": 4.2647, + "step": 1949 + }, + { + "epoch": 0.0195, + "grad_norm": 0.4138069104419874, + "learning_rate": 0.003, + "loss": 4.2509, + "step": 1950 + }, + { + "epoch": 0.01951, + "grad_norm": 0.453383618119804, + "learning_rate": 0.003, + "loss": 4.2469, + "step": 1951 + }, + { + "epoch": 0.01952, + "grad_norm": 0.5050716895331453, + "learning_rate": 0.003, + "loss": 4.2541, + "step": 1952 + }, + { + "epoch": 0.01953, + "grad_norm": 0.5444826544170946, + "learning_rate": 0.003, + "loss": 4.2612, + "step": 1953 + }, + { + "epoch": 0.01954, + "grad_norm": 0.5817368201949349, + "learning_rate": 0.003, + "loss": 4.2642, + "step": 1954 + }, + { + "epoch": 0.01955, + "grad_norm": 0.5828427095685109, + "learning_rate": 0.003, + "loss": 4.257, + "step": 1955 + }, + { + "epoch": 0.01956, + "grad_norm": 0.6033336279941961, + "learning_rate": 0.003, + "loss": 4.2559, + "step": 1956 + }, + { + "epoch": 0.01957, + "grad_norm": 0.4961523479078479, + "learning_rate": 0.003, + "loss": 4.2179, + "step": 1957 + }, + { + "epoch": 0.01958, + "grad_norm": 0.44186549075636594, + "learning_rate": 0.003, + "loss": 4.2115, + "step": 1958 + }, + { + "epoch": 0.01959, + "grad_norm": 0.5087237373575997, + "learning_rate": 0.003, + "loss": 4.2394, + "step": 1959 + }, + { + "epoch": 0.0196, + "grad_norm": 0.5262591797086301, + "learning_rate": 0.003, + "loss": 4.2322, + "step": 1960 + }, + { + "epoch": 0.01961, + "grad_norm": 0.68395452085867, + "learning_rate": 0.003, + "loss": 4.2394, + "step": 1961 + }, + { + "epoch": 0.01962, + "grad_norm": 0.8016832589931029, + "learning_rate": 0.003, + "loss": 4.2481, + "step": 1962 + }, + { + "epoch": 0.01963, + "grad_norm": 0.75220900113814, + "learning_rate": 0.003, + "loss": 4.2388, + "step": 1963 + }, + { + "epoch": 0.01964, + "grad_norm": 0.6357936481488424, + "learning_rate": 0.003, + "loss": 4.2611, + "step": 1964 + }, + { + "epoch": 0.01965, + "grad_norm": 0.5451687729946599, + "learning_rate": 0.003, + "loss": 4.2493, + "step": 1965 + }, + { + "epoch": 0.01966, + "grad_norm": 0.5793603889291398, + "learning_rate": 0.003, + "loss": 4.2538, + "step": 1966 + }, + { + "epoch": 0.01967, + "grad_norm": 0.5157681292282557, + "learning_rate": 0.003, + "loss": 4.2253, + "step": 1967 + }, + { + "epoch": 0.01968, + "grad_norm": 0.5440002063072249, + "learning_rate": 0.003, + "loss": 4.2502, + "step": 1968 + }, + { + "epoch": 0.01969, + "grad_norm": 0.5300695855996388, + "learning_rate": 0.003, + "loss": 4.2446, + "step": 1969 + }, + { + "epoch": 0.0197, + "grad_norm": 0.47595129333192576, + "learning_rate": 0.003, + "loss": 4.2426, + "step": 1970 + }, + { + "epoch": 0.01971, + "grad_norm": 0.5285948280548484, + "learning_rate": 0.003, + "loss": 4.2314, + "step": 1971 + }, + { + "epoch": 0.01972, + "grad_norm": 0.5935023814870325, + "learning_rate": 0.003, + "loss": 4.2369, + "step": 1972 + }, + { + "epoch": 0.01973, + "grad_norm": 0.6689295429120606, + "learning_rate": 0.003, + "loss": 4.2228, + "step": 1973 + }, + { + "epoch": 0.01974, + "grad_norm": 0.6805455931144389, + "learning_rate": 0.003, + "loss": 4.2627, + "step": 1974 + }, + { + "epoch": 0.01975, + "grad_norm": 0.6851649356245609, + "learning_rate": 0.003, + "loss": 4.2574, + "step": 1975 + }, + { + "epoch": 0.01976, + "grad_norm": 0.6359068486728622, + "learning_rate": 0.003, + "loss": 4.2445, + "step": 1976 + }, + { + "epoch": 0.01977, + "grad_norm": 0.5752983961700786, + "learning_rate": 0.003, + "loss": 4.2775, + "step": 1977 + }, + { + "epoch": 0.01978, + "grad_norm": 0.5885328424294697, + "learning_rate": 0.003, + "loss": 4.2773, + "step": 1978 + }, + { + "epoch": 0.01979, + "grad_norm": 0.6114820249853535, + "learning_rate": 0.003, + "loss": 4.266, + "step": 1979 + }, + { + "epoch": 0.0198, + "grad_norm": 0.6725244919020471, + "learning_rate": 0.003, + "loss": 4.2495, + "step": 1980 + }, + { + "epoch": 0.01981, + "grad_norm": 0.6899410283984171, + "learning_rate": 0.003, + "loss": 4.2347, + "step": 1981 + }, + { + "epoch": 0.01982, + "grad_norm": 0.7680678867968328, + "learning_rate": 0.003, + "loss": 4.2688, + "step": 1982 + }, + { + "epoch": 0.01983, + "grad_norm": 0.8666192600294306, + "learning_rate": 0.003, + "loss": 4.2665, + "step": 1983 + }, + { + "epoch": 0.01984, + "grad_norm": 0.7590492652886811, + "learning_rate": 0.003, + "loss": 4.2796, + "step": 1984 + }, + { + "epoch": 0.01985, + "grad_norm": 0.6539060914690586, + "learning_rate": 0.003, + "loss": 4.2645, + "step": 1985 + }, + { + "epoch": 0.01986, + "grad_norm": 0.715503037740265, + "learning_rate": 0.003, + "loss": 4.2958, + "step": 1986 + }, + { + "epoch": 0.01987, + "grad_norm": 0.7397978022844682, + "learning_rate": 0.003, + "loss": 4.254, + "step": 1987 + }, + { + "epoch": 0.01988, + "grad_norm": 0.7632034859099814, + "learning_rate": 0.003, + "loss": 4.2717, + "step": 1988 + }, + { + "epoch": 0.01989, + "grad_norm": 0.7294621095424992, + "learning_rate": 0.003, + "loss": 4.2599, + "step": 1989 + }, + { + "epoch": 0.0199, + "grad_norm": 0.7424352842692145, + "learning_rate": 0.003, + "loss": 4.274, + "step": 1990 + }, + { + "epoch": 0.01991, + "grad_norm": 0.6805432395640066, + "learning_rate": 0.003, + "loss": 4.2481, + "step": 1991 + }, + { + "epoch": 0.01992, + "grad_norm": 0.6518823852851582, + "learning_rate": 0.003, + "loss": 4.2906, + "step": 1992 + }, + { + "epoch": 0.01993, + "grad_norm": 0.5385914746778909, + "learning_rate": 0.003, + "loss": 4.2663, + "step": 1993 + }, + { + "epoch": 0.01994, + "grad_norm": 0.5600374553588182, + "learning_rate": 0.003, + "loss": 4.2843, + "step": 1994 + }, + { + "epoch": 0.01995, + "grad_norm": 0.5777853734403589, + "learning_rate": 0.003, + "loss": 4.2772, + "step": 1995 + }, + { + "epoch": 0.01996, + "grad_norm": 0.5901601073427315, + "learning_rate": 0.003, + "loss": 4.2466, + "step": 1996 + }, + { + "epoch": 0.01997, + "grad_norm": 0.5583731211765088, + "learning_rate": 0.003, + "loss": 4.2745, + "step": 1997 + }, + { + "epoch": 0.01998, + "grad_norm": 0.5499515734950207, + "learning_rate": 0.003, + "loss": 4.2592, + "step": 1998 + }, + { + "epoch": 0.01999, + "grad_norm": 0.5390040915979398, + "learning_rate": 0.003, + "loss": 4.2488, + "step": 1999 + }, + { + "epoch": 0.02, + "grad_norm": 0.6085613854495133, + "learning_rate": 0.003, + "loss": 4.2357, + "step": 2000 + }, + { + "epoch": 0.02001, + "grad_norm": 0.6642300277845461, + "learning_rate": 0.003, + "loss": 4.2761, + "step": 2001 + }, + { + "epoch": 0.02002, + "grad_norm": 0.6718563957297456, + "learning_rate": 0.003, + "loss": 4.2714, + "step": 2002 + }, + { + "epoch": 0.02003, + "grad_norm": 0.5656785141879644, + "learning_rate": 0.003, + "loss": 4.2645, + "step": 2003 + }, + { + "epoch": 0.02004, + "grad_norm": 0.5079141091367905, + "learning_rate": 0.003, + "loss": 4.2507, + "step": 2004 + }, + { + "epoch": 0.02005, + "grad_norm": 0.5217306980533027, + "learning_rate": 0.003, + "loss": 4.254, + "step": 2005 + }, + { + "epoch": 0.02006, + "grad_norm": 0.5701023982360506, + "learning_rate": 0.003, + "loss": 4.2527, + "step": 2006 + }, + { + "epoch": 0.02007, + "grad_norm": 0.5456537194359483, + "learning_rate": 0.003, + "loss": 4.2671, + "step": 2007 + }, + { + "epoch": 0.02008, + "grad_norm": 0.4424819860056267, + "learning_rate": 0.003, + "loss": 4.2538, + "step": 2008 + }, + { + "epoch": 0.02009, + "grad_norm": 0.4167499120193345, + "learning_rate": 0.003, + "loss": 4.2487, + "step": 2009 + }, + { + "epoch": 0.0201, + "grad_norm": 0.4192128686531188, + "learning_rate": 0.003, + "loss": 4.259, + "step": 2010 + }, + { + "epoch": 0.02011, + "grad_norm": 0.3848165653499132, + "learning_rate": 0.003, + "loss": 4.2447, + "step": 2011 + }, + { + "epoch": 0.02012, + "grad_norm": 0.433602941803945, + "learning_rate": 0.003, + "loss": 4.2428, + "step": 2012 + }, + { + "epoch": 0.02013, + "grad_norm": 0.4888730590149885, + "learning_rate": 0.003, + "loss": 4.2535, + "step": 2013 + }, + { + "epoch": 0.02014, + "grad_norm": 0.5673912976929151, + "learning_rate": 0.003, + "loss": 4.2387, + "step": 2014 + }, + { + "epoch": 0.02015, + "grad_norm": 0.5541499948622309, + "learning_rate": 0.003, + "loss": 4.2361, + "step": 2015 + }, + { + "epoch": 0.02016, + "grad_norm": 0.5126326745775238, + "learning_rate": 0.003, + "loss": 4.2417, + "step": 2016 + }, + { + "epoch": 0.02017, + "grad_norm": 0.4179261623064346, + "learning_rate": 0.003, + "loss": 4.2316, + "step": 2017 + }, + { + "epoch": 0.02018, + "grad_norm": 0.3898056312044461, + "learning_rate": 0.003, + "loss": 4.2341, + "step": 2018 + }, + { + "epoch": 0.02019, + "grad_norm": 0.3954814595294976, + "learning_rate": 0.003, + "loss": 4.2373, + "step": 2019 + }, + { + "epoch": 0.0202, + "grad_norm": 0.4213180029076697, + "learning_rate": 0.003, + "loss": 4.2455, + "step": 2020 + }, + { + "epoch": 0.02021, + "grad_norm": 0.46415695965874665, + "learning_rate": 0.003, + "loss": 4.2601, + "step": 2021 + }, + { + "epoch": 0.02022, + "grad_norm": 0.5255015046967251, + "learning_rate": 0.003, + "loss": 4.2204, + "step": 2022 + }, + { + "epoch": 0.02023, + "grad_norm": 0.494786978730351, + "learning_rate": 0.003, + "loss": 4.1939, + "step": 2023 + }, + { + "epoch": 0.02024, + "grad_norm": 0.4706750110587683, + "learning_rate": 0.003, + "loss": 4.2183, + "step": 2024 + }, + { + "epoch": 0.02025, + "grad_norm": 0.4845475406638408, + "learning_rate": 0.003, + "loss": 4.2698, + "step": 2025 + }, + { + "epoch": 0.02026, + "grad_norm": 0.5870205306653891, + "learning_rate": 0.003, + "loss": 4.2539, + "step": 2026 + }, + { + "epoch": 0.02027, + "grad_norm": 0.5994657871324464, + "learning_rate": 0.003, + "loss": 4.2502, + "step": 2027 + }, + { + "epoch": 0.02028, + "grad_norm": 0.5969136086797949, + "learning_rate": 0.003, + "loss": 4.2336, + "step": 2028 + }, + { + "epoch": 0.02029, + "grad_norm": 0.6010561087024289, + "learning_rate": 0.003, + "loss": 4.2419, + "step": 2029 + }, + { + "epoch": 0.0203, + "grad_norm": 0.6795949256132597, + "learning_rate": 0.003, + "loss": 4.2385, + "step": 2030 + }, + { + "epoch": 0.02031, + "grad_norm": 0.8760173401758673, + "learning_rate": 0.003, + "loss": 4.2538, + "step": 2031 + }, + { + "epoch": 0.02032, + "grad_norm": 0.9553317966985612, + "learning_rate": 0.003, + "loss": 4.2369, + "step": 2032 + }, + { + "epoch": 0.02033, + "grad_norm": 0.8062059561231115, + "learning_rate": 0.003, + "loss": 4.2804, + "step": 2033 + }, + { + "epoch": 0.02034, + "grad_norm": 0.6544594771524366, + "learning_rate": 0.003, + "loss": 4.2639, + "step": 2034 + }, + { + "epoch": 0.02035, + "grad_norm": 0.5631710363420367, + "learning_rate": 0.003, + "loss": 4.2451, + "step": 2035 + }, + { + "epoch": 0.02036, + "grad_norm": 0.5975164123168384, + "learning_rate": 0.003, + "loss": 4.2483, + "step": 2036 + }, + { + "epoch": 0.02037, + "grad_norm": 0.5354587611248767, + "learning_rate": 0.003, + "loss": 4.2582, + "step": 2037 + }, + { + "epoch": 0.02038, + "grad_norm": 0.421237418399427, + "learning_rate": 0.003, + "loss": 4.2353, + "step": 2038 + }, + { + "epoch": 0.02039, + "grad_norm": 0.4274320650935603, + "learning_rate": 0.003, + "loss": 4.2464, + "step": 2039 + }, + { + "epoch": 0.0204, + "grad_norm": 0.4164176001256727, + "learning_rate": 0.003, + "loss": 4.2496, + "step": 2040 + }, + { + "epoch": 0.02041, + "grad_norm": 0.4311810995214273, + "learning_rate": 0.003, + "loss": 4.2107, + "step": 2041 + }, + { + "epoch": 0.02042, + "grad_norm": 0.47918244910234625, + "learning_rate": 0.003, + "loss": 4.2125, + "step": 2042 + }, + { + "epoch": 0.02043, + "grad_norm": 0.5311186564484092, + "learning_rate": 0.003, + "loss": 4.2587, + "step": 2043 + }, + { + "epoch": 0.02044, + "grad_norm": 0.5775700762927194, + "learning_rate": 0.003, + "loss": 4.2349, + "step": 2044 + }, + { + "epoch": 0.02045, + "grad_norm": 0.64581468780677, + "learning_rate": 0.003, + "loss": 4.2514, + "step": 2045 + }, + { + "epoch": 0.02046, + "grad_norm": 0.6099280998923168, + "learning_rate": 0.003, + "loss": 4.2295, + "step": 2046 + }, + { + "epoch": 0.02047, + "grad_norm": 0.49206984816447724, + "learning_rate": 0.003, + "loss": 4.2619, + "step": 2047 + }, + { + "epoch": 0.02048, + "grad_norm": 0.6576416076375511, + "learning_rate": 0.003, + "loss": 4.2493, + "step": 2048 + }, + { + "epoch": 0.02049, + "grad_norm": 0.720911622296069, + "learning_rate": 0.003, + "loss": 4.2776, + "step": 2049 + }, + { + "epoch": 0.0205, + "grad_norm": 0.6221494156113543, + "learning_rate": 0.003, + "loss": 4.2218, + "step": 2050 + }, + { + "epoch": 0.02051, + "grad_norm": 0.5686539010207549, + "learning_rate": 0.003, + "loss": 4.2223, + "step": 2051 + }, + { + "epoch": 0.02052, + "grad_norm": 0.5121857318353746, + "learning_rate": 0.003, + "loss": 4.2222, + "step": 2052 + }, + { + "epoch": 0.02053, + "grad_norm": 0.4272252497200639, + "learning_rate": 0.003, + "loss": 4.2194, + "step": 2053 + }, + { + "epoch": 0.02054, + "grad_norm": 0.4939477792323304, + "learning_rate": 0.003, + "loss": 4.2591, + "step": 2054 + }, + { + "epoch": 0.02055, + "grad_norm": 0.5546078566627058, + "learning_rate": 0.003, + "loss": 4.2201, + "step": 2055 + }, + { + "epoch": 0.02056, + "grad_norm": 0.6366201965593732, + "learning_rate": 0.003, + "loss": 4.2231, + "step": 2056 + }, + { + "epoch": 0.02057, + "grad_norm": 0.6110086842051798, + "learning_rate": 0.003, + "loss": 4.2608, + "step": 2057 + }, + { + "epoch": 0.02058, + "grad_norm": 0.6217334659745377, + "learning_rate": 0.003, + "loss": 4.2367, + "step": 2058 + }, + { + "epoch": 0.02059, + "grad_norm": 0.651819516754762, + "learning_rate": 0.003, + "loss": 4.222, + "step": 2059 + }, + { + "epoch": 0.0206, + "grad_norm": 0.5754276638558378, + "learning_rate": 0.003, + "loss": 4.2638, + "step": 2060 + }, + { + "epoch": 0.02061, + "grad_norm": 0.5646246506114238, + "learning_rate": 0.003, + "loss": 4.2372, + "step": 2061 + }, + { + "epoch": 0.02062, + "grad_norm": 0.5560224904172448, + "learning_rate": 0.003, + "loss": 4.2373, + "step": 2062 + }, + { + "epoch": 0.02063, + "grad_norm": 0.6303655478175813, + "learning_rate": 0.003, + "loss": 4.2352, + "step": 2063 + }, + { + "epoch": 0.02064, + "grad_norm": 0.7300283957670998, + "learning_rate": 0.003, + "loss": 4.2505, + "step": 2064 + }, + { + "epoch": 0.02065, + "grad_norm": 0.7574662177371257, + "learning_rate": 0.003, + "loss": 4.2359, + "step": 2065 + }, + { + "epoch": 0.02066, + "grad_norm": 0.7411303035814747, + "learning_rate": 0.003, + "loss": 4.2531, + "step": 2066 + }, + { + "epoch": 0.02067, + "grad_norm": 0.650869346843019, + "learning_rate": 0.003, + "loss": 4.2447, + "step": 2067 + }, + { + "epoch": 0.02068, + "grad_norm": 0.6552127487874416, + "learning_rate": 0.003, + "loss": 4.2306, + "step": 2068 + }, + { + "epoch": 0.02069, + "grad_norm": 0.592143853498531, + "learning_rate": 0.003, + "loss": 4.2516, + "step": 2069 + }, + { + "epoch": 0.0207, + "grad_norm": 0.6207350896242446, + "learning_rate": 0.003, + "loss": 4.2263, + "step": 2070 + }, + { + "epoch": 0.02071, + "grad_norm": 0.5661720697139818, + "learning_rate": 0.003, + "loss": 4.2543, + "step": 2071 + }, + { + "epoch": 0.02072, + "grad_norm": 0.5613150829964545, + "learning_rate": 0.003, + "loss": 4.2589, + "step": 2072 + }, + { + "epoch": 0.02073, + "grad_norm": 0.5913403223082936, + "learning_rate": 0.003, + "loss": 4.24, + "step": 2073 + }, + { + "epoch": 0.02074, + "grad_norm": 0.5851338988165525, + "learning_rate": 0.003, + "loss": 4.2458, + "step": 2074 + }, + { + "epoch": 0.02075, + "grad_norm": 0.6236167441400716, + "learning_rate": 0.003, + "loss": 4.2754, + "step": 2075 + }, + { + "epoch": 0.02076, + "grad_norm": 0.6671767173763447, + "learning_rate": 0.003, + "loss": 4.2696, + "step": 2076 + }, + { + "epoch": 0.02077, + "grad_norm": 0.7466794061160641, + "learning_rate": 0.003, + "loss": 4.2397, + "step": 2077 + }, + { + "epoch": 0.02078, + "grad_norm": 0.7293207627425712, + "learning_rate": 0.003, + "loss": 4.2415, + "step": 2078 + }, + { + "epoch": 0.02079, + "grad_norm": 0.7479234412446394, + "learning_rate": 0.003, + "loss": 4.2267, + "step": 2079 + }, + { + "epoch": 0.0208, + "grad_norm": 0.8408782832743513, + "learning_rate": 0.003, + "loss": 4.2466, + "step": 2080 + }, + { + "epoch": 0.02081, + "grad_norm": 0.6858851728314246, + "learning_rate": 0.003, + "loss": 4.2727, + "step": 2081 + }, + { + "epoch": 0.02082, + "grad_norm": 0.5964291376338854, + "learning_rate": 0.003, + "loss": 4.2638, + "step": 2082 + }, + { + "epoch": 0.02083, + "grad_norm": 0.6428002164481011, + "learning_rate": 0.003, + "loss": 4.2628, + "step": 2083 + }, + { + "epoch": 0.02084, + "grad_norm": 0.5590580963015779, + "learning_rate": 0.003, + "loss": 4.2545, + "step": 2084 + }, + { + "epoch": 0.02085, + "grad_norm": 0.5709511558813687, + "learning_rate": 0.003, + "loss": 4.2275, + "step": 2085 + }, + { + "epoch": 0.02086, + "grad_norm": 0.49147509133747364, + "learning_rate": 0.003, + "loss": 4.2297, + "step": 2086 + }, + { + "epoch": 0.02087, + "grad_norm": 0.4526072737335104, + "learning_rate": 0.003, + "loss": 4.2294, + "step": 2087 + }, + { + "epoch": 0.02088, + "grad_norm": 0.4016974716447893, + "learning_rate": 0.003, + "loss": 4.2602, + "step": 2088 + }, + { + "epoch": 0.02089, + "grad_norm": 0.3822543479885703, + "learning_rate": 0.003, + "loss": 4.2249, + "step": 2089 + }, + { + "epoch": 0.0209, + "grad_norm": 0.36830581223819503, + "learning_rate": 0.003, + "loss": 4.2427, + "step": 2090 + }, + { + "epoch": 0.02091, + "grad_norm": 0.35577804274859626, + "learning_rate": 0.003, + "loss": 4.2285, + "step": 2091 + }, + { + "epoch": 0.02092, + "grad_norm": 0.4347176959080428, + "learning_rate": 0.003, + "loss": 4.2276, + "step": 2092 + }, + { + "epoch": 0.02093, + "grad_norm": 0.5304539612640176, + "learning_rate": 0.003, + "loss": 4.1974, + "step": 2093 + }, + { + "epoch": 0.02094, + "grad_norm": 0.7330731731573918, + "learning_rate": 0.003, + "loss": 4.2123, + "step": 2094 + }, + { + "epoch": 0.02095, + "grad_norm": 0.7847225937930644, + "learning_rate": 0.003, + "loss": 4.2645, + "step": 2095 + }, + { + "epoch": 0.02096, + "grad_norm": 0.6648124367791377, + "learning_rate": 0.003, + "loss": 4.2516, + "step": 2096 + }, + { + "epoch": 0.02097, + "grad_norm": 0.5924239660138091, + "learning_rate": 0.003, + "loss": 4.2279, + "step": 2097 + }, + { + "epoch": 0.02098, + "grad_norm": 0.5602628841134751, + "learning_rate": 0.003, + "loss": 4.2465, + "step": 2098 + }, + { + "epoch": 0.02099, + "grad_norm": 0.46052580380046326, + "learning_rate": 0.003, + "loss": 4.2512, + "step": 2099 + }, + { + "epoch": 0.021, + "grad_norm": 0.4339841503760461, + "learning_rate": 0.003, + "loss": 4.263, + "step": 2100 + }, + { + "epoch": 0.02101, + "grad_norm": 0.3839438876111581, + "learning_rate": 0.003, + "loss": 4.2511, + "step": 2101 + }, + { + "epoch": 0.02102, + "grad_norm": 0.3754480167103612, + "learning_rate": 0.003, + "loss": 4.2531, + "step": 2102 + }, + { + "epoch": 0.02103, + "grad_norm": 0.4162207299096809, + "learning_rate": 0.003, + "loss": 4.2151, + "step": 2103 + }, + { + "epoch": 0.02104, + "grad_norm": 0.46199291536196674, + "learning_rate": 0.003, + "loss": 4.2197, + "step": 2104 + }, + { + "epoch": 0.02105, + "grad_norm": 0.5379532841395008, + "learning_rate": 0.003, + "loss": 4.2264, + "step": 2105 + }, + { + "epoch": 0.02106, + "grad_norm": 0.5254543741501657, + "learning_rate": 0.003, + "loss": 4.2187, + "step": 2106 + }, + { + "epoch": 0.02107, + "grad_norm": 0.5366355457801288, + "learning_rate": 0.003, + "loss": 4.2328, + "step": 2107 + }, + { + "epoch": 0.02108, + "grad_norm": 0.6703887406069353, + "learning_rate": 0.003, + "loss": 4.2182, + "step": 2108 + }, + { + "epoch": 0.02109, + "grad_norm": 0.73797667148783, + "learning_rate": 0.003, + "loss": 4.2096, + "step": 2109 + }, + { + "epoch": 0.0211, + "grad_norm": 0.7306602624016222, + "learning_rate": 0.003, + "loss": 4.2289, + "step": 2110 + }, + { + "epoch": 0.02111, + "grad_norm": 0.7062809207206387, + "learning_rate": 0.003, + "loss": 4.2409, + "step": 2111 + }, + { + "epoch": 0.02112, + "grad_norm": 0.6626082871730016, + "learning_rate": 0.003, + "loss": 4.2378, + "step": 2112 + }, + { + "epoch": 0.02113, + "grad_norm": 0.7157807873847553, + "learning_rate": 0.003, + "loss": 4.238, + "step": 2113 + }, + { + "epoch": 0.02114, + "grad_norm": 0.7509167267520314, + "learning_rate": 0.003, + "loss": 4.2327, + "step": 2114 + }, + { + "epoch": 0.02115, + "grad_norm": 0.8631295846669226, + "learning_rate": 0.003, + "loss": 4.234, + "step": 2115 + }, + { + "epoch": 0.02116, + "grad_norm": 0.8818862519491879, + "learning_rate": 0.003, + "loss": 4.2687, + "step": 2116 + }, + { + "epoch": 0.02117, + "grad_norm": 0.9901874745713439, + "learning_rate": 0.003, + "loss": 4.2757, + "step": 2117 + }, + { + "epoch": 0.02118, + "grad_norm": 1.0129584193183192, + "learning_rate": 0.003, + "loss": 4.28, + "step": 2118 + }, + { + "epoch": 0.02119, + "grad_norm": 1.0428306836946426, + "learning_rate": 0.003, + "loss": 4.2999, + "step": 2119 + }, + { + "epoch": 0.0212, + "grad_norm": 0.8765906436767104, + "learning_rate": 0.003, + "loss": 4.2719, + "step": 2120 + }, + { + "epoch": 0.02121, + "grad_norm": 0.7948828861009111, + "learning_rate": 0.003, + "loss": 4.2627, + "step": 2121 + }, + { + "epoch": 0.02122, + "grad_norm": 0.7472865629342507, + "learning_rate": 0.003, + "loss": 4.2726, + "step": 2122 + }, + { + "epoch": 0.02123, + "grad_norm": 0.6385709948713884, + "learning_rate": 0.003, + "loss": 4.2438, + "step": 2123 + }, + { + "epoch": 0.02124, + "grad_norm": 0.557924731350531, + "learning_rate": 0.003, + "loss": 4.2611, + "step": 2124 + }, + { + "epoch": 0.02125, + "grad_norm": 0.5786579211625313, + "learning_rate": 0.003, + "loss": 4.2778, + "step": 2125 + }, + { + "epoch": 0.02126, + "grad_norm": 0.6027087958052927, + "learning_rate": 0.003, + "loss": 4.2718, + "step": 2126 + }, + { + "epoch": 0.02127, + "grad_norm": 0.5438113788648495, + "learning_rate": 0.003, + "loss": 4.2756, + "step": 2127 + }, + { + "epoch": 0.02128, + "grad_norm": 0.4557902210786888, + "learning_rate": 0.003, + "loss": 4.261, + "step": 2128 + }, + { + "epoch": 0.02129, + "grad_norm": 0.43397893502328583, + "learning_rate": 0.003, + "loss": 4.259, + "step": 2129 + }, + { + "epoch": 0.0213, + "grad_norm": 0.4555779798497891, + "learning_rate": 0.003, + "loss": 4.233, + "step": 2130 + }, + { + "epoch": 0.02131, + "grad_norm": 0.4779110278336917, + "learning_rate": 0.003, + "loss": 4.2569, + "step": 2131 + }, + { + "epoch": 0.02132, + "grad_norm": 0.5370327391448694, + "learning_rate": 0.003, + "loss": 4.2673, + "step": 2132 + }, + { + "epoch": 0.02133, + "grad_norm": 0.5952731573509253, + "learning_rate": 0.003, + "loss": 4.2603, + "step": 2133 + }, + { + "epoch": 0.02134, + "grad_norm": 0.570127740822763, + "learning_rate": 0.003, + "loss": 4.2602, + "step": 2134 + }, + { + "epoch": 0.02135, + "grad_norm": 0.4582370027457041, + "learning_rate": 0.003, + "loss": 4.2533, + "step": 2135 + }, + { + "epoch": 0.02136, + "grad_norm": 0.36641117706775855, + "learning_rate": 0.003, + "loss": 4.219, + "step": 2136 + }, + { + "epoch": 0.02137, + "grad_norm": 0.34800353584932325, + "learning_rate": 0.003, + "loss": 4.2578, + "step": 2137 + }, + { + "epoch": 0.02138, + "grad_norm": 0.30837616213325136, + "learning_rate": 0.003, + "loss": 4.2635, + "step": 2138 + }, + { + "epoch": 0.02139, + "grad_norm": 0.3564245359330032, + "learning_rate": 0.003, + "loss": 4.2543, + "step": 2139 + }, + { + "epoch": 0.0214, + "grad_norm": 0.35935426429256007, + "learning_rate": 0.003, + "loss": 4.2457, + "step": 2140 + }, + { + "epoch": 0.02141, + "grad_norm": 0.44450585444098134, + "learning_rate": 0.003, + "loss": 4.2278, + "step": 2141 + }, + { + "epoch": 0.02142, + "grad_norm": 0.5664462228803099, + "learning_rate": 0.003, + "loss": 4.2232, + "step": 2142 + }, + { + "epoch": 0.02143, + "grad_norm": 0.6979889941355174, + "learning_rate": 0.003, + "loss": 4.2444, + "step": 2143 + }, + { + "epoch": 0.02144, + "grad_norm": 0.6079500902139742, + "learning_rate": 0.003, + "loss": 4.2463, + "step": 2144 + }, + { + "epoch": 0.02145, + "grad_norm": 0.4705352279704145, + "learning_rate": 0.003, + "loss": 4.2234, + "step": 2145 + }, + { + "epoch": 0.02146, + "grad_norm": 0.6487033829979426, + "learning_rate": 0.003, + "loss": 4.242, + "step": 2146 + }, + { + "epoch": 0.02147, + "grad_norm": 0.646803097358189, + "learning_rate": 0.003, + "loss": 4.2643, + "step": 2147 + }, + { + "epoch": 0.02148, + "grad_norm": 0.5846288473404795, + "learning_rate": 0.003, + "loss": 4.2698, + "step": 2148 + }, + { + "epoch": 0.02149, + "grad_norm": 0.6324004546408943, + "learning_rate": 0.003, + "loss": 4.2592, + "step": 2149 + }, + { + "epoch": 0.0215, + "grad_norm": 0.6348968665767296, + "learning_rate": 0.003, + "loss": 4.2447, + "step": 2150 + }, + { + "epoch": 0.02151, + "grad_norm": 0.6038850039685164, + "learning_rate": 0.003, + "loss": 4.2496, + "step": 2151 + }, + { + "epoch": 0.02152, + "grad_norm": 0.6571231105206827, + "learning_rate": 0.003, + "loss": 4.229, + "step": 2152 + }, + { + "epoch": 0.02153, + "grad_norm": 0.5810533963397566, + "learning_rate": 0.003, + "loss": 4.2363, + "step": 2153 + }, + { + "epoch": 0.02154, + "grad_norm": 0.4956565749223054, + "learning_rate": 0.003, + "loss": 4.2226, + "step": 2154 + }, + { + "epoch": 0.02155, + "grad_norm": 0.437847806481028, + "learning_rate": 0.003, + "loss": 4.252, + "step": 2155 + }, + { + "epoch": 0.02156, + "grad_norm": 0.4180405852162412, + "learning_rate": 0.003, + "loss": 4.2371, + "step": 2156 + }, + { + "epoch": 0.02157, + "grad_norm": 0.3760963967155856, + "learning_rate": 0.003, + "loss": 4.2006, + "step": 2157 + }, + { + "epoch": 0.02158, + "grad_norm": 0.428171131535246, + "learning_rate": 0.003, + "loss": 4.2226, + "step": 2158 + }, + { + "epoch": 0.02159, + "grad_norm": 0.4821995287398208, + "learning_rate": 0.003, + "loss": 4.2389, + "step": 2159 + }, + { + "epoch": 0.0216, + "grad_norm": 0.6037348014921639, + "learning_rate": 0.003, + "loss": 4.2444, + "step": 2160 + }, + { + "epoch": 0.02161, + "grad_norm": 0.6609447933471442, + "learning_rate": 0.003, + "loss": 4.2362, + "step": 2161 + }, + { + "epoch": 0.02162, + "grad_norm": 0.6433252356071486, + "learning_rate": 0.003, + "loss": 4.2396, + "step": 2162 + }, + { + "epoch": 0.02163, + "grad_norm": 0.5709983086388406, + "learning_rate": 0.003, + "loss": 4.2528, + "step": 2163 + }, + { + "epoch": 0.02164, + "grad_norm": 0.6142803039373849, + "learning_rate": 0.003, + "loss": 4.2494, + "step": 2164 + }, + { + "epoch": 0.02165, + "grad_norm": 0.5872422750407742, + "learning_rate": 0.003, + "loss": 4.2104, + "step": 2165 + }, + { + "epoch": 0.02166, + "grad_norm": 0.5895046030386504, + "learning_rate": 0.003, + "loss": 4.2544, + "step": 2166 + }, + { + "epoch": 0.02167, + "grad_norm": 0.5695409441331826, + "learning_rate": 0.003, + "loss": 4.2313, + "step": 2167 + }, + { + "epoch": 0.02168, + "grad_norm": 0.5621779951550028, + "learning_rate": 0.003, + "loss": 4.2285, + "step": 2168 + }, + { + "epoch": 0.02169, + "grad_norm": 0.4890626633165933, + "learning_rate": 0.003, + "loss": 4.2251, + "step": 2169 + }, + { + "epoch": 0.0217, + "grad_norm": 0.5684142074498969, + "learning_rate": 0.003, + "loss": 4.2378, + "step": 2170 + }, + { + "epoch": 0.02171, + "grad_norm": 0.6068809167714128, + "learning_rate": 0.003, + "loss": 4.2505, + "step": 2171 + }, + { + "epoch": 0.02172, + "grad_norm": 0.5617111304615819, + "learning_rate": 0.003, + "loss": 4.2228, + "step": 2172 + }, + { + "epoch": 0.02173, + "grad_norm": 0.5652162783140336, + "learning_rate": 0.003, + "loss": 4.2246, + "step": 2173 + }, + { + "epoch": 0.02174, + "grad_norm": 0.6450142330375198, + "learning_rate": 0.003, + "loss": 4.2267, + "step": 2174 + }, + { + "epoch": 0.02175, + "grad_norm": 0.6696169755702878, + "learning_rate": 0.003, + "loss": 4.2515, + "step": 2175 + }, + { + "epoch": 0.02176, + "grad_norm": 0.6614934153481808, + "learning_rate": 0.003, + "loss": 4.2347, + "step": 2176 + }, + { + "epoch": 0.02177, + "grad_norm": 0.7712758358266676, + "learning_rate": 0.003, + "loss": 4.2426, + "step": 2177 + }, + { + "epoch": 0.02178, + "grad_norm": 0.8052241964952243, + "learning_rate": 0.003, + "loss": 4.234, + "step": 2178 + }, + { + "epoch": 0.02179, + "grad_norm": 0.8190327773882939, + "learning_rate": 0.003, + "loss": 4.2474, + "step": 2179 + }, + { + "epoch": 0.0218, + "grad_norm": 0.7633412782608947, + "learning_rate": 0.003, + "loss": 4.2656, + "step": 2180 + }, + { + "epoch": 0.02181, + "grad_norm": 0.6487141963532598, + "learning_rate": 0.003, + "loss": 4.2624, + "step": 2181 + }, + { + "epoch": 0.02182, + "grad_norm": 0.5938256063465354, + "learning_rate": 0.003, + "loss": 4.2443, + "step": 2182 + }, + { + "epoch": 0.02183, + "grad_norm": 0.572941959470127, + "learning_rate": 0.003, + "loss": 4.2546, + "step": 2183 + }, + { + "epoch": 0.02184, + "grad_norm": 0.5472840158022778, + "learning_rate": 0.003, + "loss": 4.2258, + "step": 2184 + }, + { + "epoch": 0.02185, + "grad_norm": 0.48428509908133466, + "learning_rate": 0.003, + "loss": 4.2187, + "step": 2185 + }, + { + "epoch": 0.02186, + "grad_norm": 0.41644305586561753, + "learning_rate": 0.003, + "loss": 4.2353, + "step": 2186 + }, + { + "epoch": 0.02187, + "grad_norm": 0.4220079190945348, + "learning_rate": 0.003, + "loss": 4.2314, + "step": 2187 + }, + { + "epoch": 0.02188, + "grad_norm": 0.4040099253154041, + "learning_rate": 0.003, + "loss": 4.2084, + "step": 2188 + }, + { + "epoch": 0.02189, + "grad_norm": 0.384791417229758, + "learning_rate": 0.003, + "loss": 4.2412, + "step": 2189 + }, + { + "epoch": 0.0219, + "grad_norm": 0.4200536352754627, + "learning_rate": 0.003, + "loss": 4.2271, + "step": 2190 + }, + { + "epoch": 0.02191, + "grad_norm": 0.4219256289842046, + "learning_rate": 0.003, + "loss": 4.2197, + "step": 2191 + }, + { + "epoch": 0.02192, + "grad_norm": 0.3796661113601938, + "learning_rate": 0.003, + "loss": 4.2468, + "step": 2192 + }, + { + "epoch": 0.02193, + "grad_norm": 0.3951332402646691, + "learning_rate": 0.003, + "loss": 4.2124, + "step": 2193 + }, + { + "epoch": 0.02194, + "grad_norm": 0.43785643693424503, + "learning_rate": 0.003, + "loss": 4.2632, + "step": 2194 + }, + { + "epoch": 0.02195, + "grad_norm": 0.40101316917194174, + "learning_rate": 0.003, + "loss": 4.2286, + "step": 2195 + }, + { + "epoch": 0.02196, + "grad_norm": 0.3895866783766733, + "learning_rate": 0.003, + "loss": 4.2337, + "step": 2196 + }, + { + "epoch": 0.02197, + "grad_norm": 0.43978912346203514, + "learning_rate": 0.003, + "loss": 4.2283, + "step": 2197 + }, + { + "epoch": 0.02198, + "grad_norm": 0.5128410451451384, + "learning_rate": 0.003, + "loss": 4.2199, + "step": 2198 + }, + { + "epoch": 0.02199, + "grad_norm": 0.6337030003238012, + "learning_rate": 0.003, + "loss": 4.192, + "step": 2199 + }, + { + "epoch": 0.022, + "grad_norm": 0.8998218740769401, + "learning_rate": 0.003, + "loss": 4.2327, + "step": 2200 + }, + { + "epoch": 0.02201, + "grad_norm": 0.9431446506923681, + "learning_rate": 0.003, + "loss": 4.266, + "step": 2201 + }, + { + "epoch": 0.02202, + "grad_norm": 0.7612219282493126, + "learning_rate": 0.003, + "loss": 4.2433, + "step": 2202 + }, + { + "epoch": 0.02203, + "grad_norm": 0.6635684162527457, + "learning_rate": 0.003, + "loss": 4.2702, + "step": 2203 + }, + { + "epoch": 0.02204, + "grad_norm": 0.7621929732197255, + "learning_rate": 0.003, + "loss": 4.2688, + "step": 2204 + }, + { + "epoch": 0.02205, + "grad_norm": 0.8741388018917097, + "learning_rate": 0.003, + "loss": 4.2762, + "step": 2205 + }, + { + "epoch": 0.02206, + "grad_norm": 0.9703192513292367, + "learning_rate": 0.003, + "loss": 4.26, + "step": 2206 + }, + { + "epoch": 0.02207, + "grad_norm": 0.8841717513591386, + "learning_rate": 0.003, + "loss": 4.2677, + "step": 2207 + }, + { + "epoch": 0.02208, + "grad_norm": 0.988099891532076, + "learning_rate": 0.003, + "loss": 4.2632, + "step": 2208 + }, + { + "epoch": 0.02209, + "grad_norm": 1.1146750705911779, + "learning_rate": 0.003, + "loss": 4.3, + "step": 2209 + }, + { + "epoch": 0.0221, + "grad_norm": 0.9650886464683209, + "learning_rate": 0.003, + "loss": 4.2786, + "step": 2210 + }, + { + "epoch": 0.02211, + "grad_norm": 0.8938651751677317, + "learning_rate": 0.003, + "loss": 4.289, + "step": 2211 + }, + { + "epoch": 0.02212, + "grad_norm": 0.8205609056746809, + "learning_rate": 0.003, + "loss": 4.2861, + "step": 2212 + }, + { + "epoch": 0.02213, + "grad_norm": 0.8079312852613237, + "learning_rate": 0.003, + "loss": 4.2922, + "step": 2213 + }, + { + "epoch": 0.02214, + "grad_norm": 0.6624076409511721, + "learning_rate": 0.003, + "loss": 4.2812, + "step": 2214 + }, + { + "epoch": 0.02215, + "grad_norm": 0.6346284348339961, + "learning_rate": 0.003, + "loss": 4.2836, + "step": 2215 + }, + { + "epoch": 0.02216, + "grad_norm": 0.6751649712191042, + "learning_rate": 0.003, + "loss": 4.262, + "step": 2216 + }, + { + "epoch": 0.02217, + "grad_norm": 0.7610112582408145, + "learning_rate": 0.003, + "loss": 4.2609, + "step": 2217 + }, + { + "epoch": 0.02218, + "grad_norm": 0.7718759523371377, + "learning_rate": 0.003, + "loss": 4.2931, + "step": 2218 + }, + { + "epoch": 0.02219, + "grad_norm": 0.7213187726292779, + "learning_rate": 0.003, + "loss": 4.2813, + "step": 2219 + }, + { + "epoch": 0.0222, + "grad_norm": 0.6931461020270434, + "learning_rate": 0.003, + "loss": 4.2766, + "step": 2220 + }, + { + "epoch": 0.02221, + "grad_norm": 0.6335484790352024, + "learning_rate": 0.003, + "loss": 4.2612, + "step": 2221 + }, + { + "epoch": 0.02222, + "grad_norm": 0.5200624241333043, + "learning_rate": 0.003, + "loss": 4.2557, + "step": 2222 + }, + { + "epoch": 0.02223, + "grad_norm": 0.47510216931250304, + "learning_rate": 0.003, + "loss": 4.2633, + "step": 2223 + }, + { + "epoch": 0.02224, + "grad_norm": 0.4708153372606544, + "learning_rate": 0.003, + "loss": 4.2436, + "step": 2224 + }, + { + "epoch": 0.02225, + "grad_norm": 0.4713839636734649, + "learning_rate": 0.003, + "loss": 4.2455, + "step": 2225 + }, + { + "epoch": 0.02226, + "grad_norm": 0.45242642141513834, + "learning_rate": 0.003, + "loss": 4.2572, + "step": 2226 + }, + { + "epoch": 0.02227, + "grad_norm": 0.5403277615094669, + "learning_rate": 0.003, + "loss": 4.2506, + "step": 2227 + }, + { + "epoch": 0.02228, + "grad_norm": 0.5419608649795276, + "learning_rate": 0.003, + "loss": 4.2378, + "step": 2228 + }, + { + "epoch": 0.02229, + "grad_norm": 0.5074364347379132, + "learning_rate": 0.003, + "loss": 4.2163, + "step": 2229 + }, + { + "epoch": 0.0223, + "grad_norm": 0.44694138444366344, + "learning_rate": 0.003, + "loss": 4.2496, + "step": 2230 + }, + { + "epoch": 0.02231, + "grad_norm": 0.3954086546661809, + "learning_rate": 0.003, + "loss": 4.2523, + "step": 2231 + }, + { + "epoch": 0.02232, + "grad_norm": 0.38774210113182334, + "learning_rate": 0.003, + "loss": 4.2551, + "step": 2232 + }, + { + "epoch": 0.02233, + "grad_norm": 0.33836180367798796, + "learning_rate": 0.003, + "loss": 4.2207, + "step": 2233 + }, + { + "epoch": 0.02234, + "grad_norm": 0.4026067706239707, + "learning_rate": 0.003, + "loss": 4.2423, + "step": 2234 + }, + { + "epoch": 0.02235, + "grad_norm": 0.3714972499375364, + "learning_rate": 0.003, + "loss": 4.2633, + "step": 2235 + }, + { + "epoch": 0.02236, + "grad_norm": 0.3656117183652505, + "learning_rate": 0.003, + "loss": 4.2142, + "step": 2236 + }, + { + "epoch": 0.02237, + "grad_norm": 0.3860156575975231, + "learning_rate": 0.003, + "loss": 4.248, + "step": 2237 + }, + { + "epoch": 0.02238, + "grad_norm": 0.40106624326868706, + "learning_rate": 0.003, + "loss": 4.227, + "step": 2238 + }, + { + "epoch": 0.02239, + "grad_norm": 0.344415203397115, + "learning_rate": 0.003, + "loss": 4.2104, + "step": 2239 + }, + { + "epoch": 0.0224, + "grad_norm": 0.30398409326848724, + "learning_rate": 0.003, + "loss": 4.1926, + "step": 2240 + }, + { + "epoch": 0.02241, + "grad_norm": 0.29910610471963883, + "learning_rate": 0.003, + "loss": 4.2222, + "step": 2241 + }, + { + "epoch": 0.02242, + "grad_norm": 0.32765283542789003, + "learning_rate": 0.003, + "loss": 4.2062, + "step": 2242 + }, + { + "epoch": 0.02243, + "grad_norm": 0.47068547305005354, + "learning_rate": 0.003, + "loss": 4.2397, + "step": 2243 + }, + { + "epoch": 0.02244, + "grad_norm": 0.619857801317153, + "learning_rate": 0.003, + "loss": 4.2209, + "step": 2244 + }, + { + "epoch": 0.02245, + "grad_norm": 0.7578137247550054, + "learning_rate": 0.003, + "loss": 4.214, + "step": 2245 + }, + { + "epoch": 0.02246, + "grad_norm": 0.7174200654723875, + "learning_rate": 0.003, + "loss": 4.2474, + "step": 2246 + }, + { + "epoch": 0.02247, + "grad_norm": 0.5627816069554801, + "learning_rate": 0.003, + "loss": 4.2147, + "step": 2247 + }, + { + "epoch": 0.02248, + "grad_norm": 0.645246341768817, + "learning_rate": 0.003, + "loss": 4.2267, + "step": 2248 + }, + { + "epoch": 0.02249, + "grad_norm": 0.5894073242429686, + "learning_rate": 0.003, + "loss": 4.2178, + "step": 2249 + }, + { + "epoch": 0.0225, + "grad_norm": 0.4708614120277191, + "learning_rate": 0.003, + "loss": 4.2264, + "step": 2250 + }, + { + "epoch": 0.02251, + "grad_norm": 0.5732703006696422, + "learning_rate": 0.003, + "loss": 4.2516, + "step": 2251 + }, + { + "epoch": 0.02252, + "grad_norm": 0.4876319361460763, + "learning_rate": 0.003, + "loss": 4.2397, + "step": 2252 + }, + { + "epoch": 0.02253, + "grad_norm": 0.4518791398680973, + "learning_rate": 0.003, + "loss": 4.2182, + "step": 2253 + }, + { + "epoch": 0.02254, + "grad_norm": 0.4686604716989295, + "learning_rate": 0.003, + "loss": 4.2242, + "step": 2254 + }, + { + "epoch": 0.02255, + "grad_norm": 0.46315510081369105, + "learning_rate": 0.003, + "loss": 4.2076, + "step": 2255 + }, + { + "epoch": 0.02256, + "grad_norm": 0.4115888436676222, + "learning_rate": 0.003, + "loss": 4.196, + "step": 2256 + }, + { + "epoch": 0.02257, + "grad_norm": 0.4250881653478155, + "learning_rate": 0.003, + "loss": 4.213, + "step": 2257 + }, + { + "epoch": 0.02258, + "grad_norm": 0.41687848922253873, + "learning_rate": 0.003, + "loss": 4.2225, + "step": 2258 + }, + { + "epoch": 0.02259, + "grad_norm": 0.42540602594527765, + "learning_rate": 0.003, + "loss": 4.1835, + "step": 2259 + }, + { + "epoch": 0.0226, + "grad_norm": 0.4465875569855491, + "learning_rate": 0.003, + "loss": 4.2393, + "step": 2260 + }, + { + "epoch": 0.02261, + "grad_norm": 0.4515532217997496, + "learning_rate": 0.003, + "loss": 4.2163, + "step": 2261 + }, + { + "epoch": 0.02262, + "grad_norm": 0.47647870630071837, + "learning_rate": 0.003, + "loss": 4.2167, + "step": 2262 + }, + { + "epoch": 0.02263, + "grad_norm": 0.5731324666338504, + "learning_rate": 0.003, + "loss": 4.2339, + "step": 2263 + }, + { + "epoch": 0.02264, + "grad_norm": 0.5964954733768894, + "learning_rate": 0.003, + "loss": 4.2281, + "step": 2264 + }, + { + "epoch": 0.02265, + "grad_norm": 0.5712613063923222, + "learning_rate": 0.003, + "loss": 4.1855, + "step": 2265 + }, + { + "epoch": 0.02266, + "grad_norm": 0.5312684023721527, + "learning_rate": 0.003, + "loss": 4.2007, + "step": 2266 + }, + { + "epoch": 0.02267, + "grad_norm": 0.5398877397723375, + "learning_rate": 0.003, + "loss": 4.2294, + "step": 2267 + }, + { + "epoch": 0.02268, + "grad_norm": 0.6097072952396848, + "learning_rate": 0.003, + "loss": 4.2058, + "step": 2268 + }, + { + "epoch": 0.02269, + "grad_norm": 0.667945707294233, + "learning_rate": 0.003, + "loss": 4.2339, + "step": 2269 + }, + { + "epoch": 0.0227, + "grad_norm": 0.6780050185078733, + "learning_rate": 0.003, + "loss": 4.23, + "step": 2270 + }, + { + "epoch": 0.02271, + "grad_norm": 0.6140341865228114, + "learning_rate": 0.003, + "loss": 4.2328, + "step": 2271 + }, + { + "epoch": 0.02272, + "grad_norm": 0.6077740705952491, + "learning_rate": 0.003, + "loss": 4.2343, + "step": 2272 + }, + { + "epoch": 0.02273, + "grad_norm": 0.6228614859955112, + "learning_rate": 0.003, + "loss": 4.237, + "step": 2273 + }, + { + "epoch": 0.02274, + "grad_norm": 0.6497689324113504, + "learning_rate": 0.003, + "loss": 4.2221, + "step": 2274 + }, + { + "epoch": 0.02275, + "grad_norm": 0.6593608950407358, + "learning_rate": 0.003, + "loss": 4.2201, + "step": 2275 + }, + { + "epoch": 0.02276, + "grad_norm": 0.6819172040141601, + "learning_rate": 0.003, + "loss": 4.2138, + "step": 2276 + }, + { + "epoch": 0.02277, + "grad_norm": 0.7620205189580369, + "learning_rate": 0.003, + "loss": 4.2321, + "step": 2277 + }, + { + "epoch": 0.02278, + "grad_norm": 0.622995687715391, + "learning_rate": 0.003, + "loss": 4.2327, + "step": 2278 + }, + { + "epoch": 0.02279, + "grad_norm": 0.5747298625450237, + "learning_rate": 0.003, + "loss": 4.2411, + "step": 2279 + }, + { + "epoch": 0.0228, + "grad_norm": 0.5988670118192916, + "learning_rate": 0.003, + "loss": 4.2313, + "step": 2280 + }, + { + "epoch": 0.02281, + "grad_norm": 0.611381401213052, + "learning_rate": 0.003, + "loss": 4.2272, + "step": 2281 + }, + { + "epoch": 0.02282, + "grad_norm": 0.5846556638089334, + "learning_rate": 0.003, + "loss": 4.2409, + "step": 2282 + }, + { + "epoch": 0.02283, + "grad_norm": 0.5475554139402415, + "learning_rate": 0.003, + "loss": 4.2536, + "step": 2283 + }, + { + "epoch": 0.02284, + "grad_norm": 0.5238952449646747, + "learning_rate": 0.003, + "loss": 4.2482, + "step": 2284 + }, + { + "epoch": 0.02285, + "grad_norm": 0.5449063604054842, + "learning_rate": 0.003, + "loss": 4.1851, + "step": 2285 + }, + { + "epoch": 0.02286, + "grad_norm": 0.5626629932890803, + "learning_rate": 0.003, + "loss": 4.25, + "step": 2286 + }, + { + "epoch": 0.02287, + "grad_norm": 0.5354326845671384, + "learning_rate": 0.003, + "loss": 4.2276, + "step": 2287 + }, + { + "epoch": 0.02288, + "grad_norm": 0.5865237537888423, + "learning_rate": 0.003, + "loss": 4.2517, + "step": 2288 + }, + { + "epoch": 0.02289, + "grad_norm": 0.6587817447859279, + "learning_rate": 0.003, + "loss": 4.2552, + "step": 2289 + }, + { + "epoch": 0.0229, + "grad_norm": 0.763374859559503, + "learning_rate": 0.003, + "loss": 4.2369, + "step": 2290 + }, + { + "epoch": 0.02291, + "grad_norm": 0.9238217255206519, + "learning_rate": 0.003, + "loss": 4.2327, + "step": 2291 + }, + { + "epoch": 0.02292, + "grad_norm": 0.8375515673022389, + "learning_rate": 0.003, + "loss": 4.2276, + "step": 2292 + }, + { + "epoch": 0.02293, + "grad_norm": 0.8229318263274273, + "learning_rate": 0.003, + "loss": 4.233, + "step": 2293 + }, + { + "epoch": 0.02294, + "grad_norm": 0.767668173749955, + "learning_rate": 0.003, + "loss": 4.2363, + "step": 2294 + }, + { + "epoch": 0.02295, + "grad_norm": 0.728858935661346, + "learning_rate": 0.003, + "loss": 4.2296, + "step": 2295 + }, + { + "epoch": 0.02296, + "grad_norm": 0.74447993660391, + "learning_rate": 0.003, + "loss": 4.2662, + "step": 2296 + }, + { + "epoch": 0.02297, + "grad_norm": 0.6968342842354955, + "learning_rate": 0.003, + "loss": 4.2327, + "step": 2297 + }, + { + "epoch": 0.02298, + "grad_norm": 0.6411785233315757, + "learning_rate": 0.003, + "loss": 4.2441, + "step": 2298 + }, + { + "epoch": 0.02299, + "grad_norm": 0.6765897982497444, + "learning_rate": 0.003, + "loss": 4.2472, + "step": 2299 + }, + { + "epoch": 0.023, + "grad_norm": 0.7218316432570615, + "learning_rate": 0.003, + "loss": 4.2659, + "step": 2300 + }, + { + "epoch": 0.02301, + "grad_norm": 0.6693972300748606, + "learning_rate": 0.003, + "loss": 4.2316, + "step": 2301 + }, + { + "epoch": 0.02302, + "grad_norm": 0.6521855282359511, + "learning_rate": 0.003, + "loss": 4.2347, + "step": 2302 + }, + { + "epoch": 0.02303, + "grad_norm": 0.5919587845688344, + "learning_rate": 0.003, + "loss": 4.224, + "step": 2303 + }, + { + "epoch": 0.02304, + "grad_norm": 0.602764977489394, + "learning_rate": 0.003, + "loss": 4.2562, + "step": 2304 + }, + { + "epoch": 0.02305, + "grad_norm": 0.5558668601822961, + "learning_rate": 0.003, + "loss": 4.2313, + "step": 2305 + }, + { + "epoch": 0.02306, + "grad_norm": 0.5156139404377212, + "learning_rate": 0.003, + "loss": 4.2058, + "step": 2306 + }, + { + "epoch": 0.02307, + "grad_norm": 0.5662595240344765, + "learning_rate": 0.003, + "loss": 4.2364, + "step": 2307 + }, + { + "epoch": 0.02308, + "grad_norm": 0.6173971988602593, + "learning_rate": 0.003, + "loss": 4.2276, + "step": 2308 + }, + { + "epoch": 0.02309, + "grad_norm": 0.7369887546423364, + "learning_rate": 0.003, + "loss": 4.261, + "step": 2309 + }, + { + "epoch": 0.0231, + "grad_norm": 0.7967301688365783, + "learning_rate": 0.003, + "loss": 4.2446, + "step": 2310 + }, + { + "epoch": 0.02311, + "grad_norm": 0.6089871818858962, + "learning_rate": 0.003, + "loss": 4.245, + "step": 2311 + }, + { + "epoch": 0.02312, + "grad_norm": 0.47586710546753136, + "learning_rate": 0.003, + "loss": 4.2111, + "step": 2312 + }, + { + "epoch": 0.02313, + "grad_norm": 0.4974927545735252, + "learning_rate": 0.003, + "loss": 4.2079, + "step": 2313 + }, + { + "epoch": 0.02314, + "grad_norm": 0.43769519573127175, + "learning_rate": 0.003, + "loss": 4.2204, + "step": 2314 + }, + { + "epoch": 0.02315, + "grad_norm": 0.401786726516975, + "learning_rate": 0.003, + "loss": 4.222, + "step": 2315 + }, + { + "epoch": 0.02316, + "grad_norm": 0.4259802780968188, + "learning_rate": 0.003, + "loss": 4.2341, + "step": 2316 + }, + { + "epoch": 0.02317, + "grad_norm": 0.41182452876226056, + "learning_rate": 0.003, + "loss": 4.2152, + "step": 2317 + }, + { + "epoch": 0.02318, + "grad_norm": 0.4136233759664175, + "learning_rate": 0.003, + "loss": 4.2028, + "step": 2318 + }, + { + "epoch": 0.02319, + "grad_norm": 0.3949029424988165, + "learning_rate": 0.003, + "loss": 4.2247, + "step": 2319 + }, + { + "epoch": 0.0232, + "grad_norm": 0.35504551442283205, + "learning_rate": 0.003, + "loss": 4.2312, + "step": 2320 + }, + { + "epoch": 0.02321, + "grad_norm": 0.32986956749591967, + "learning_rate": 0.003, + "loss": 4.2084, + "step": 2321 + }, + { + "epoch": 0.02322, + "grad_norm": 0.2968681197139734, + "learning_rate": 0.003, + "loss": 4.2156, + "step": 2322 + }, + { + "epoch": 0.02323, + "grad_norm": 0.3207801190374275, + "learning_rate": 0.003, + "loss": 4.1916, + "step": 2323 + }, + { + "epoch": 0.02324, + "grad_norm": 0.33142643285723733, + "learning_rate": 0.003, + "loss": 4.1895, + "step": 2324 + }, + { + "epoch": 0.02325, + "grad_norm": 0.3535725697371811, + "learning_rate": 0.003, + "loss": 4.215, + "step": 2325 + }, + { + "epoch": 0.02326, + "grad_norm": 0.3678851746936938, + "learning_rate": 0.003, + "loss": 4.2279, + "step": 2326 + }, + { + "epoch": 0.02327, + "grad_norm": 0.4101658150492574, + "learning_rate": 0.003, + "loss": 4.2161, + "step": 2327 + }, + { + "epoch": 0.02328, + "grad_norm": 0.4519738717854165, + "learning_rate": 0.003, + "loss": 4.2155, + "step": 2328 + }, + { + "epoch": 0.02329, + "grad_norm": 0.5647347753978986, + "learning_rate": 0.003, + "loss": 4.2089, + "step": 2329 + }, + { + "epoch": 0.0233, + "grad_norm": 0.6839573886066393, + "learning_rate": 0.003, + "loss": 4.2105, + "step": 2330 + }, + { + "epoch": 0.02331, + "grad_norm": 0.7998107409261247, + "learning_rate": 0.003, + "loss": 4.2543, + "step": 2331 + }, + { + "epoch": 0.02332, + "grad_norm": 0.689815752941326, + "learning_rate": 0.003, + "loss": 4.2161, + "step": 2332 + }, + { + "epoch": 0.02333, + "grad_norm": 0.6555251301313495, + "learning_rate": 0.003, + "loss": 4.222, + "step": 2333 + }, + { + "epoch": 0.02334, + "grad_norm": 0.6263058920174612, + "learning_rate": 0.003, + "loss": 4.1943, + "step": 2334 + }, + { + "epoch": 0.02335, + "grad_norm": 0.5873514688131092, + "learning_rate": 0.003, + "loss": 4.2307, + "step": 2335 + }, + { + "epoch": 0.02336, + "grad_norm": 0.6178829826634938, + "learning_rate": 0.003, + "loss": 4.2245, + "step": 2336 + }, + { + "epoch": 0.02337, + "grad_norm": 0.6776239992532147, + "learning_rate": 0.003, + "loss": 4.2338, + "step": 2337 + }, + { + "epoch": 0.02338, + "grad_norm": 0.6079153730498182, + "learning_rate": 0.003, + "loss": 4.2143, + "step": 2338 + }, + { + "epoch": 0.02339, + "grad_norm": 0.5540805388644168, + "learning_rate": 0.003, + "loss": 4.2097, + "step": 2339 + }, + { + "epoch": 0.0234, + "grad_norm": 0.4930522285796136, + "learning_rate": 0.003, + "loss": 4.2219, + "step": 2340 + }, + { + "epoch": 0.02341, + "grad_norm": 0.5114159866478617, + "learning_rate": 0.003, + "loss": 4.198, + "step": 2341 + }, + { + "epoch": 0.02342, + "grad_norm": 0.5088489673837915, + "learning_rate": 0.003, + "loss": 4.212, + "step": 2342 + }, + { + "epoch": 0.02343, + "grad_norm": 0.5701232150213591, + "learning_rate": 0.003, + "loss": 4.2136, + "step": 2343 + }, + { + "epoch": 0.02344, + "grad_norm": 0.5264496634765071, + "learning_rate": 0.003, + "loss": 4.2103, + "step": 2344 + }, + { + "epoch": 0.02345, + "grad_norm": 0.463092136026227, + "learning_rate": 0.003, + "loss": 4.2037, + "step": 2345 + }, + { + "epoch": 0.02346, + "grad_norm": 0.5119464591082599, + "learning_rate": 0.003, + "loss": 4.2041, + "step": 2346 + }, + { + "epoch": 0.02347, + "grad_norm": 0.5680858879896561, + "learning_rate": 0.003, + "loss": 4.2412, + "step": 2347 + }, + { + "epoch": 0.02348, + "grad_norm": 0.6575493478001748, + "learning_rate": 0.003, + "loss": 4.2098, + "step": 2348 + }, + { + "epoch": 0.02349, + "grad_norm": 0.7446709988793854, + "learning_rate": 0.003, + "loss": 4.2329, + "step": 2349 + }, + { + "epoch": 0.0235, + "grad_norm": 0.9616447862037343, + "learning_rate": 0.003, + "loss": 4.2417, + "step": 2350 + }, + { + "epoch": 0.02351, + "grad_norm": 0.892365430816041, + "learning_rate": 0.003, + "loss": 4.2567, + "step": 2351 + }, + { + "epoch": 0.02352, + "grad_norm": 0.7434115511547027, + "learning_rate": 0.003, + "loss": 4.2055, + "step": 2352 + }, + { + "epoch": 0.02353, + "grad_norm": 0.7750549493995498, + "learning_rate": 0.003, + "loss": 4.242, + "step": 2353 + }, + { + "epoch": 0.02354, + "grad_norm": 0.7464193045182832, + "learning_rate": 0.003, + "loss": 4.2509, + "step": 2354 + }, + { + "epoch": 0.02355, + "grad_norm": 0.6497010807293944, + "learning_rate": 0.003, + "loss": 4.2393, + "step": 2355 + }, + { + "epoch": 0.02356, + "grad_norm": 0.6914197602121488, + "learning_rate": 0.003, + "loss": 4.2239, + "step": 2356 + }, + { + "epoch": 0.02357, + "grad_norm": 0.6846425351668688, + "learning_rate": 0.003, + "loss": 4.2295, + "step": 2357 + }, + { + "epoch": 0.02358, + "grad_norm": 0.6290127860120783, + "learning_rate": 0.003, + "loss": 4.2459, + "step": 2358 + }, + { + "epoch": 0.02359, + "grad_norm": 0.6298376689313879, + "learning_rate": 0.003, + "loss": 4.2228, + "step": 2359 + }, + { + "epoch": 0.0236, + "grad_norm": 0.5166523858154795, + "learning_rate": 0.003, + "loss": 4.256, + "step": 2360 + }, + { + "epoch": 0.02361, + "grad_norm": 0.5392240288462758, + "learning_rate": 0.003, + "loss": 4.2371, + "step": 2361 + }, + { + "epoch": 0.02362, + "grad_norm": 0.46488034590567795, + "learning_rate": 0.003, + "loss": 4.2246, + "step": 2362 + }, + { + "epoch": 0.02363, + "grad_norm": 0.45695605203462375, + "learning_rate": 0.003, + "loss": 4.1978, + "step": 2363 + }, + { + "epoch": 0.02364, + "grad_norm": 0.450417146973962, + "learning_rate": 0.003, + "loss": 4.2171, + "step": 2364 + }, + { + "epoch": 0.02365, + "grad_norm": 0.46003087118242425, + "learning_rate": 0.003, + "loss": 4.236, + "step": 2365 + }, + { + "epoch": 0.02366, + "grad_norm": 0.48662211661939403, + "learning_rate": 0.003, + "loss": 4.2096, + "step": 2366 + }, + { + "epoch": 0.02367, + "grad_norm": 0.5645419223124684, + "learning_rate": 0.003, + "loss": 4.2212, + "step": 2367 + }, + { + "epoch": 0.02368, + "grad_norm": 0.5895187996336625, + "learning_rate": 0.003, + "loss": 4.1956, + "step": 2368 + }, + { + "epoch": 0.02369, + "grad_norm": 0.6664022667321152, + "learning_rate": 0.003, + "loss": 4.211, + "step": 2369 + }, + { + "epoch": 0.0237, + "grad_norm": 0.6992460452789251, + "learning_rate": 0.003, + "loss": 4.241, + "step": 2370 + }, + { + "epoch": 0.02371, + "grad_norm": 0.6591985126074834, + "learning_rate": 0.003, + "loss": 4.223, + "step": 2371 + }, + { + "epoch": 0.02372, + "grad_norm": 0.5265059072138487, + "learning_rate": 0.003, + "loss": 4.2082, + "step": 2372 + }, + { + "epoch": 0.02373, + "grad_norm": 0.4951201634347875, + "learning_rate": 0.003, + "loss": 4.1933, + "step": 2373 + }, + { + "epoch": 0.02374, + "grad_norm": 0.4876679210846313, + "learning_rate": 0.003, + "loss": 4.2011, + "step": 2374 + }, + { + "epoch": 0.02375, + "grad_norm": 0.408164979920852, + "learning_rate": 0.003, + "loss": 4.2017, + "step": 2375 + }, + { + "epoch": 0.02376, + "grad_norm": 0.3895969391943079, + "learning_rate": 0.003, + "loss": 4.2257, + "step": 2376 + }, + { + "epoch": 0.02377, + "grad_norm": 0.4053428947086922, + "learning_rate": 0.003, + "loss": 4.1923, + "step": 2377 + }, + { + "epoch": 0.02378, + "grad_norm": 0.43012507081905327, + "learning_rate": 0.003, + "loss": 4.2019, + "step": 2378 + }, + { + "epoch": 0.02379, + "grad_norm": 0.4764061060377796, + "learning_rate": 0.003, + "loss": 4.1906, + "step": 2379 + }, + { + "epoch": 0.0238, + "grad_norm": 0.4854688158872303, + "learning_rate": 0.003, + "loss": 4.1642, + "step": 2380 + }, + { + "epoch": 0.02381, + "grad_norm": 0.4926226608113921, + "learning_rate": 0.003, + "loss": 4.1768, + "step": 2381 + }, + { + "epoch": 0.02382, + "grad_norm": 0.5715519289577061, + "learning_rate": 0.003, + "loss": 4.2369, + "step": 2382 + }, + { + "epoch": 0.02383, + "grad_norm": 0.5852310729435649, + "learning_rate": 0.003, + "loss": 4.2016, + "step": 2383 + }, + { + "epoch": 0.02384, + "grad_norm": 0.5027933039192362, + "learning_rate": 0.003, + "loss": 4.186, + "step": 2384 + }, + { + "epoch": 0.02385, + "grad_norm": 0.5671992764095898, + "learning_rate": 0.003, + "loss": 4.2151, + "step": 2385 + }, + { + "epoch": 0.02386, + "grad_norm": 0.5716874891458956, + "learning_rate": 0.003, + "loss": 4.1952, + "step": 2386 + }, + { + "epoch": 0.02387, + "grad_norm": 0.5519898067526273, + "learning_rate": 0.003, + "loss": 4.1919, + "step": 2387 + }, + { + "epoch": 0.02388, + "grad_norm": 0.5749264518166138, + "learning_rate": 0.003, + "loss": 4.2477, + "step": 2388 + }, + { + "epoch": 0.02389, + "grad_norm": 0.6599062653773942, + "learning_rate": 0.003, + "loss": 4.2112, + "step": 2389 + }, + { + "epoch": 0.0239, + "grad_norm": 0.8007810046111682, + "learning_rate": 0.003, + "loss": 4.2104, + "step": 2390 + }, + { + "epoch": 0.02391, + "grad_norm": 0.7728699720546903, + "learning_rate": 0.003, + "loss": 4.2355, + "step": 2391 + }, + { + "epoch": 0.02392, + "grad_norm": 0.637372032061665, + "learning_rate": 0.003, + "loss": 4.2375, + "step": 2392 + }, + { + "epoch": 0.02393, + "grad_norm": 0.5856837925361781, + "learning_rate": 0.003, + "loss": 4.1967, + "step": 2393 + }, + { + "epoch": 0.02394, + "grad_norm": 0.5796441189535547, + "learning_rate": 0.003, + "loss": 4.2107, + "step": 2394 + }, + { + "epoch": 0.02395, + "grad_norm": 0.49276837104854343, + "learning_rate": 0.003, + "loss": 4.2217, + "step": 2395 + }, + { + "epoch": 0.02396, + "grad_norm": 0.4942734378430775, + "learning_rate": 0.003, + "loss": 4.2363, + "step": 2396 + }, + { + "epoch": 0.02397, + "grad_norm": 0.4893677735684375, + "learning_rate": 0.003, + "loss": 4.2162, + "step": 2397 + }, + { + "epoch": 0.02398, + "grad_norm": 0.5811783701919775, + "learning_rate": 0.003, + "loss": 4.2259, + "step": 2398 + }, + { + "epoch": 0.02399, + "grad_norm": 0.7848440120318722, + "learning_rate": 0.003, + "loss": 4.2123, + "step": 2399 + }, + { + "epoch": 0.024, + "grad_norm": 1.102610977529852, + "learning_rate": 0.003, + "loss": 4.2476, + "step": 2400 + }, + { + "epoch": 0.02401, + "grad_norm": 0.8135265365179677, + "learning_rate": 0.003, + "loss": 4.215, + "step": 2401 + }, + { + "epoch": 0.02402, + "grad_norm": 0.583172308220775, + "learning_rate": 0.003, + "loss": 4.24, + "step": 2402 + }, + { + "epoch": 0.02403, + "grad_norm": 0.5891421965222777, + "learning_rate": 0.003, + "loss": 4.2092, + "step": 2403 + }, + { + "epoch": 0.02404, + "grad_norm": 0.5648336236466582, + "learning_rate": 0.003, + "loss": 4.2446, + "step": 2404 + }, + { + "epoch": 0.02405, + "grad_norm": 0.5745395036103889, + "learning_rate": 0.003, + "loss": 4.2523, + "step": 2405 + }, + { + "epoch": 0.02406, + "grad_norm": 0.5533135454038, + "learning_rate": 0.003, + "loss": 4.2331, + "step": 2406 + }, + { + "epoch": 0.02407, + "grad_norm": 0.49929019237957983, + "learning_rate": 0.003, + "loss": 4.2073, + "step": 2407 + }, + { + "epoch": 0.02408, + "grad_norm": 0.4943118983366084, + "learning_rate": 0.003, + "loss": 4.2144, + "step": 2408 + }, + { + "epoch": 0.02409, + "grad_norm": 0.5449234611486594, + "learning_rate": 0.003, + "loss": 4.2212, + "step": 2409 + }, + { + "epoch": 0.0241, + "grad_norm": 0.585465535756614, + "learning_rate": 0.003, + "loss": 4.2358, + "step": 2410 + }, + { + "epoch": 0.02411, + "grad_norm": 0.6698135788102235, + "learning_rate": 0.003, + "loss": 4.2088, + "step": 2411 + }, + { + "epoch": 0.02412, + "grad_norm": 0.7369339920597645, + "learning_rate": 0.003, + "loss": 4.217, + "step": 2412 + }, + { + "epoch": 0.02413, + "grad_norm": 0.7305355993897886, + "learning_rate": 0.003, + "loss": 4.2166, + "step": 2413 + }, + { + "epoch": 0.02414, + "grad_norm": 0.7556720083696307, + "learning_rate": 0.003, + "loss": 4.2325, + "step": 2414 + }, + { + "epoch": 0.02415, + "grad_norm": 0.6579993879979832, + "learning_rate": 0.003, + "loss": 4.2326, + "step": 2415 + }, + { + "epoch": 0.02416, + "grad_norm": 0.6108411981053206, + "learning_rate": 0.003, + "loss": 4.2107, + "step": 2416 + }, + { + "epoch": 0.02417, + "grad_norm": 0.5049292600507906, + "learning_rate": 0.003, + "loss": 4.2353, + "step": 2417 + }, + { + "epoch": 0.02418, + "grad_norm": 0.4900011615352856, + "learning_rate": 0.003, + "loss": 4.2259, + "step": 2418 + }, + { + "epoch": 0.02419, + "grad_norm": 0.4521560630393701, + "learning_rate": 0.003, + "loss": 4.2185, + "step": 2419 + }, + { + "epoch": 0.0242, + "grad_norm": 0.4620681001428492, + "learning_rate": 0.003, + "loss": 4.2321, + "step": 2420 + }, + { + "epoch": 0.02421, + "grad_norm": 0.4916389353511205, + "learning_rate": 0.003, + "loss": 4.1947, + "step": 2421 + }, + { + "epoch": 0.02422, + "grad_norm": 0.5425187775211936, + "learning_rate": 0.003, + "loss": 4.2503, + "step": 2422 + }, + { + "epoch": 0.02423, + "grad_norm": 0.6038360240873328, + "learning_rate": 0.003, + "loss": 4.2434, + "step": 2423 + }, + { + "epoch": 0.02424, + "grad_norm": 0.6325505331996517, + "learning_rate": 0.003, + "loss": 4.2388, + "step": 2424 + }, + { + "epoch": 0.02425, + "grad_norm": 0.5934465012445626, + "learning_rate": 0.003, + "loss": 4.2245, + "step": 2425 + }, + { + "epoch": 0.02426, + "grad_norm": 0.4852317425419543, + "learning_rate": 0.003, + "loss": 4.2259, + "step": 2426 + }, + { + "epoch": 0.02427, + "grad_norm": 0.43424836675618983, + "learning_rate": 0.003, + "loss": 4.2507, + "step": 2427 + }, + { + "epoch": 0.02428, + "grad_norm": 0.42558088050674436, + "learning_rate": 0.003, + "loss": 4.2206, + "step": 2428 + }, + { + "epoch": 0.02429, + "grad_norm": 0.4007132170531262, + "learning_rate": 0.003, + "loss": 4.2052, + "step": 2429 + }, + { + "epoch": 0.0243, + "grad_norm": 0.41461133903064823, + "learning_rate": 0.003, + "loss": 4.2277, + "step": 2430 + }, + { + "epoch": 0.02431, + "grad_norm": 0.42272490135521923, + "learning_rate": 0.003, + "loss": 4.2037, + "step": 2431 + }, + { + "epoch": 0.02432, + "grad_norm": 0.5028259204616563, + "learning_rate": 0.003, + "loss": 4.2279, + "step": 2432 + }, + { + "epoch": 0.02433, + "grad_norm": 0.6593220419533437, + "learning_rate": 0.003, + "loss": 4.2479, + "step": 2433 + }, + { + "epoch": 0.02434, + "grad_norm": 0.8027427099690023, + "learning_rate": 0.003, + "loss": 4.2219, + "step": 2434 + }, + { + "epoch": 0.02435, + "grad_norm": 0.8024207699638355, + "learning_rate": 0.003, + "loss": 4.2281, + "step": 2435 + }, + { + "epoch": 0.02436, + "grad_norm": 0.7133615297883351, + "learning_rate": 0.003, + "loss": 4.2314, + "step": 2436 + }, + { + "epoch": 0.02437, + "grad_norm": 0.6439666201166401, + "learning_rate": 0.003, + "loss": 4.2142, + "step": 2437 + }, + { + "epoch": 0.02438, + "grad_norm": 0.5858625518676239, + "learning_rate": 0.003, + "loss": 4.23, + "step": 2438 + }, + { + "epoch": 0.02439, + "grad_norm": 0.5398100187318233, + "learning_rate": 0.003, + "loss": 4.1977, + "step": 2439 + }, + { + "epoch": 0.0244, + "grad_norm": 0.5287852378026328, + "learning_rate": 0.003, + "loss": 4.2469, + "step": 2440 + }, + { + "epoch": 0.02441, + "grad_norm": 0.5446152594519079, + "learning_rate": 0.003, + "loss": 4.2079, + "step": 2441 + }, + { + "epoch": 0.02442, + "grad_norm": 0.5050019658896749, + "learning_rate": 0.003, + "loss": 4.2415, + "step": 2442 + }, + { + "epoch": 0.02443, + "grad_norm": 0.5178076569823932, + "learning_rate": 0.003, + "loss": 4.1775, + "step": 2443 + }, + { + "epoch": 0.02444, + "grad_norm": 0.48889067635139094, + "learning_rate": 0.003, + "loss": 4.2045, + "step": 2444 + }, + { + "epoch": 0.02445, + "grad_norm": 0.5068159078989423, + "learning_rate": 0.003, + "loss": 4.2106, + "step": 2445 + }, + { + "epoch": 0.02446, + "grad_norm": 0.5126341533127841, + "learning_rate": 0.003, + "loss": 4.2121, + "step": 2446 + }, + { + "epoch": 0.02447, + "grad_norm": 0.4825218100648375, + "learning_rate": 0.003, + "loss": 4.2174, + "step": 2447 + }, + { + "epoch": 0.02448, + "grad_norm": 0.46377191023349634, + "learning_rate": 0.003, + "loss": 4.2169, + "step": 2448 + }, + { + "epoch": 0.02449, + "grad_norm": 0.5086550488721309, + "learning_rate": 0.003, + "loss": 4.2116, + "step": 2449 + }, + { + "epoch": 0.0245, + "grad_norm": 0.5740571687783603, + "learning_rate": 0.003, + "loss": 4.2159, + "step": 2450 + }, + { + "epoch": 0.02451, + "grad_norm": 0.5066757455693393, + "learning_rate": 0.003, + "loss": 4.1927, + "step": 2451 + }, + { + "epoch": 0.02452, + "grad_norm": 0.40863119709622003, + "learning_rate": 0.003, + "loss": 4.1968, + "step": 2452 + }, + { + "epoch": 0.02453, + "grad_norm": 0.44965288153972177, + "learning_rate": 0.003, + "loss": 4.1737, + "step": 2453 + }, + { + "epoch": 0.02454, + "grad_norm": 0.5324108231499827, + "learning_rate": 0.003, + "loss": 4.1829, + "step": 2454 + }, + { + "epoch": 0.02455, + "grad_norm": 0.6445398719560812, + "learning_rate": 0.003, + "loss": 4.1993, + "step": 2455 + }, + { + "epoch": 0.02456, + "grad_norm": 0.778972240900702, + "learning_rate": 0.003, + "loss": 4.1741, + "step": 2456 + }, + { + "epoch": 0.02457, + "grad_norm": 0.6895291525630388, + "learning_rate": 0.003, + "loss": 4.2281, + "step": 2457 + }, + { + "epoch": 0.02458, + "grad_norm": 0.5932547289808822, + "learning_rate": 0.003, + "loss": 4.1934, + "step": 2458 + }, + { + "epoch": 0.02459, + "grad_norm": 0.5873468472438081, + "learning_rate": 0.003, + "loss": 4.2234, + "step": 2459 + }, + { + "epoch": 0.0246, + "grad_norm": 0.6051334869486308, + "learning_rate": 0.003, + "loss": 4.2298, + "step": 2460 + }, + { + "epoch": 0.02461, + "grad_norm": 0.598960720110885, + "learning_rate": 0.003, + "loss": 4.2047, + "step": 2461 + }, + { + "epoch": 0.02462, + "grad_norm": 0.5336669037751343, + "learning_rate": 0.003, + "loss": 4.2313, + "step": 2462 + }, + { + "epoch": 0.02463, + "grad_norm": 0.5634439620834016, + "learning_rate": 0.003, + "loss": 4.2301, + "step": 2463 + }, + { + "epoch": 0.02464, + "grad_norm": 0.6026663982662849, + "learning_rate": 0.003, + "loss": 4.2123, + "step": 2464 + }, + { + "epoch": 0.02465, + "grad_norm": 0.7167643023421324, + "learning_rate": 0.003, + "loss": 4.2336, + "step": 2465 + }, + { + "epoch": 0.02466, + "grad_norm": 0.7713681863589867, + "learning_rate": 0.003, + "loss": 4.2172, + "step": 2466 + }, + { + "epoch": 0.02467, + "grad_norm": 0.6973139861610792, + "learning_rate": 0.003, + "loss": 4.1958, + "step": 2467 + }, + { + "epoch": 0.02468, + "grad_norm": 0.6102611547152099, + "learning_rate": 0.003, + "loss": 4.2374, + "step": 2468 + }, + { + "epoch": 0.02469, + "grad_norm": 0.6541217572054333, + "learning_rate": 0.003, + "loss": 4.2207, + "step": 2469 + }, + { + "epoch": 0.0247, + "grad_norm": 0.6679631943322341, + "learning_rate": 0.003, + "loss": 4.2165, + "step": 2470 + }, + { + "epoch": 0.02471, + "grad_norm": 0.7159740472340401, + "learning_rate": 0.003, + "loss": 4.2074, + "step": 2471 + }, + { + "epoch": 0.02472, + "grad_norm": 0.7905007318669144, + "learning_rate": 0.003, + "loss": 4.2152, + "step": 2472 + }, + { + "epoch": 0.02473, + "grad_norm": 0.8760585933429962, + "learning_rate": 0.003, + "loss": 4.2103, + "step": 2473 + }, + { + "epoch": 0.02474, + "grad_norm": 0.9178632648529088, + "learning_rate": 0.003, + "loss": 4.2472, + "step": 2474 + }, + { + "epoch": 0.02475, + "grad_norm": 0.8011380404965301, + "learning_rate": 0.003, + "loss": 4.2214, + "step": 2475 + }, + { + "epoch": 0.02476, + "grad_norm": 0.8709257402916254, + "learning_rate": 0.003, + "loss": 4.2334, + "step": 2476 + }, + { + "epoch": 0.02477, + "grad_norm": 0.8785057383582693, + "learning_rate": 0.003, + "loss": 4.2701, + "step": 2477 + }, + { + "epoch": 0.02478, + "grad_norm": 0.8265644129631496, + "learning_rate": 0.003, + "loss": 4.2383, + "step": 2478 + }, + { + "epoch": 0.02479, + "grad_norm": 0.7447630055654997, + "learning_rate": 0.003, + "loss": 4.228, + "step": 2479 + }, + { + "epoch": 0.0248, + "grad_norm": 0.7215953534640771, + "learning_rate": 0.003, + "loss": 4.2659, + "step": 2480 + }, + { + "epoch": 0.02481, + "grad_norm": 0.7223104530962532, + "learning_rate": 0.003, + "loss": 4.2494, + "step": 2481 + }, + { + "epoch": 0.02482, + "grad_norm": 0.7755415514978912, + "learning_rate": 0.003, + "loss": 4.2642, + "step": 2482 + }, + { + "epoch": 0.02483, + "grad_norm": 0.8000293423835445, + "learning_rate": 0.003, + "loss": 4.2156, + "step": 2483 + }, + { + "epoch": 0.02484, + "grad_norm": 0.6385982842425331, + "learning_rate": 0.003, + "loss": 4.2408, + "step": 2484 + }, + { + "epoch": 0.02485, + "grad_norm": 0.7165633164736369, + "learning_rate": 0.003, + "loss": 4.2628, + "step": 2485 + }, + { + "epoch": 0.02486, + "grad_norm": 0.666869104779761, + "learning_rate": 0.003, + "loss": 4.2845, + "step": 2486 + }, + { + "epoch": 0.02487, + "grad_norm": 0.6526919813636585, + "learning_rate": 0.003, + "loss": 4.2376, + "step": 2487 + }, + { + "epoch": 0.02488, + "grad_norm": 0.6689740712518896, + "learning_rate": 0.003, + "loss": 4.2153, + "step": 2488 + }, + { + "epoch": 0.02489, + "grad_norm": 0.6762316340846642, + "learning_rate": 0.003, + "loss": 4.2054, + "step": 2489 + }, + { + "epoch": 0.0249, + "grad_norm": 0.6381111552878137, + "learning_rate": 0.003, + "loss": 4.2081, + "step": 2490 + }, + { + "epoch": 0.02491, + "grad_norm": 0.6575656650220585, + "learning_rate": 0.003, + "loss": 4.234, + "step": 2491 + }, + { + "epoch": 0.02492, + "grad_norm": 0.6264756996157673, + "learning_rate": 0.003, + "loss": 4.243, + "step": 2492 + }, + { + "epoch": 0.02493, + "grad_norm": 0.5475484048874111, + "learning_rate": 0.003, + "loss": 4.2085, + "step": 2493 + }, + { + "epoch": 0.02494, + "grad_norm": 0.5832237103576846, + "learning_rate": 0.003, + "loss": 4.222, + "step": 2494 + }, + { + "epoch": 0.02495, + "grad_norm": 0.49782209661562854, + "learning_rate": 0.003, + "loss": 4.2274, + "step": 2495 + }, + { + "epoch": 0.02496, + "grad_norm": 0.403753171755924, + "learning_rate": 0.003, + "loss": 4.2141, + "step": 2496 + }, + { + "epoch": 0.02497, + "grad_norm": 0.36378014443485196, + "learning_rate": 0.003, + "loss": 4.2165, + "step": 2497 + }, + { + "epoch": 0.02498, + "grad_norm": 0.317230162546613, + "learning_rate": 0.003, + "loss": 4.2025, + "step": 2498 + }, + { + "epoch": 0.02499, + "grad_norm": 0.3277491553637038, + "learning_rate": 0.003, + "loss": 4.2117, + "step": 2499 + }, + { + "epoch": 0.025, + "grad_norm": 0.2882430663823555, + "learning_rate": 0.003, + "loss": 4.1956, + "step": 2500 + }, + { + "epoch": 0.02501, + "grad_norm": 0.27906079169809134, + "learning_rate": 0.003, + "loss": 4.2251, + "step": 2501 + }, + { + "epoch": 0.02502, + "grad_norm": 0.2871738503175601, + "learning_rate": 0.003, + "loss": 4.2056, + "step": 2502 + }, + { + "epoch": 0.02503, + "grad_norm": 0.3203204666233, + "learning_rate": 0.003, + "loss": 4.2125, + "step": 2503 + }, + { + "epoch": 0.02504, + "grad_norm": 0.38939921573272746, + "learning_rate": 0.003, + "loss": 4.1919, + "step": 2504 + }, + { + "epoch": 0.02505, + "grad_norm": 0.5688786331558378, + "learning_rate": 0.003, + "loss": 4.1964, + "step": 2505 + }, + { + "epoch": 0.02506, + "grad_norm": 0.7962360843283562, + "learning_rate": 0.003, + "loss": 4.2243, + "step": 2506 + }, + { + "epoch": 0.02507, + "grad_norm": 0.8008164951690123, + "learning_rate": 0.003, + "loss": 4.2174, + "step": 2507 + }, + { + "epoch": 0.02508, + "grad_norm": 0.5143170677845642, + "learning_rate": 0.003, + "loss": 4.2001, + "step": 2508 + }, + { + "epoch": 0.02509, + "grad_norm": 0.6566394093706333, + "learning_rate": 0.003, + "loss": 4.2119, + "step": 2509 + }, + { + "epoch": 0.0251, + "grad_norm": 0.7710410640765211, + "learning_rate": 0.003, + "loss": 4.2285, + "step": 2510 + }, + { + "epoch": 0.02511, + "grad_norm": 0.6799202006125559, + "learning_rate": 0.003, + "loss": 4.2382, + "step": 2511 + }, + { + "epoch": 0.02512, + "grad_norm": 0.6083471860257056, + "learning_rate": 0.003, + "loss": 4.2155, + "step": 2512 + }, + { + "epoch": 0.02513, + "grad_norm": 0.6298329580111398, + "learning_rate": 0.003, + "loss": 4.2139, + "step": 2513 + }, + { + "epoch": 0.02514, + "grad_norm": 0.5557311367958748, + "learning_rate": 0.003, + "loss": 4.1968, + "step": 2514 + }, + { + "epoch": 0.02515, + "grad_norm": 0.5172156322515098, + "learning_rate": 0.003, + "loss": 4.1861, + "step": 2515 + }, + { + "epoch": 0.02516, + "grad_norm": 0.458365154058261, + "learning_rate": 0.003, + "loss": 4.2042, + "step": 2516 + }, + { + "epoch": 0.02517, + "grad_norm": 0.47236102620897663, + "learning_rate": 0.003, + "loss": 4.2043, + "step": 2517 + }, + { + "epoch": 0.02518, + "grad_norm": 0.3986825790599497, + "learning_rate": 0.003, + "loss": 4.2114, + "step": 2518 + }, + { + "epoch": 0.02519, + "grad_norm": 0.402047193379362, + "learning_rate": 0.003, + "loss": 4.19, + "step": 2519 + }, + { + "epoch": 0.0252, + "grad_norm": 0.3916406367114395, + "learning_rate": 0.003, + "loss": 4.1997, + "step": 2520 + }, + { + "epoch": 0.02521, + "grad_norm": 0.41614255260417965, + "learning_rate": 0.003, + "loss": 4.196, + "step": 2521 + }, + { + "epoch": 0.02522, + "grad_norm": 0.39122808601170617, + "learning_rate": 0.003, + "loss": 4.1916, + "step": 2522 + }, + { + "epoch": 0.02523, + "grad_norm": 0.32265648253713153, + "learning_rate": 0.003, + "loss": 4.1909, + "step": 2523 + }, + { + "epoch": 0.02524, + "grad_norm": 0.36648634510918254, + "learning_rate": 0.003, + "loss": 4.2042, + "step": 2524 + }, + { + "epoch": 0.02525, + "grad_norm": 0.36515875028981976, + "learning_rate": 0.003, + "loss": 4.2171, + "step": 2525 + }, + { + "epoch": 0.02526, + "grad_norm": 0.3879027742116084, + "learning_rate": 0.003, + "loss": 4.1612, + "step": 2526 + }, + { + "epoch": 0.02527, + "grad_norm": 0.39717009730244557, + "learning_rate": 0.003, + "loss": 4.1965, + "step": 2527 + }, + { + "epoch": 0.02528, + "grad_norm": 0.4631846026712896, + "learning_rate": 0.003, + "loss": 4.2219, + "step": 2528 + }, + { + "epoch": 0.02529, + "grad_norm": 0.5301783594747518, + "learning_rate": 0.003, + "loss": 4.1932, + "step": 2529 + }, + { + "epoch": 0.0253, + "grad_norm": 0.5779986723238283, + "learning_rate": 0.003, + "loss": 4.2154, + "step": 2530 + }, + { + "epoch": 0.02531, + "grad_norm": 0.5503995974353456, + "learning_rate": 0.003, + "loss": 4.2161, + "step": 2531 + }, + { + "epoch": 0.02532, + "grad_norm": 0.5718882840372597, + "learning_rate": 0.003, + "loss": 4.1838, + "step": 2532 + }, + { + "epoch": 0.02533, + "grad_norm": 0.6687455514677941, + "learning_rate": 0.003, + "loss": 4.2279, + "step": 2533 + }, + { + "epoch": 0.02534, + "grad_norm": 0.6203399011184582, + "learning_rate": 0.003, + "loss": 4.2081, + "step": 2534 + }, + { + "epoch": 0.02535, + "grad_norm": 0.7566323425219436, + "learning_rate": 0.003, + "loss": 4.205, + "step": 2535 + }, + { + "epoch": 0.02536, + "grad_norm": 1.072654720911659, + "learning_rate": 0.003, + "loss": 4.2219, + "step": 2536 + }, + { + "epoch": 0.02537, + "grad_norm": 1.0281329924391467, + "learning_rate": 0.003, + "loss": 4.2253, + "step": 2537 + }, + { + "epoch": 0.02538, + "grad_norm": 0.9199583773276817, + "learning_rate": 0.003, + "loss": 4.2009, + "step": 2538 + }, + { + "epoch": 0.02539, + "grad_norm": 0.8211807842712615, + "learning_rate": 0.003, + "loss": 4.2563, + "step": 2539 + }, + { + "epoch": 0.0254, + "grad_norm": 0.8400545711442372, + "learning_rate": 0.003, + "loss": 4.2509, + "step": 2540 + }, + { + "epoch": 0.02541, + "grad_norm": 0.820158631703877, + "learning_rate": 0.003, + "loss": 4.2399, + "step": 2541 + }, + { + "epoch": 0.02542, + "grad_norm": 0.7974385353772132, + "learning_rate": 0.003, + "loss": 4.2171, + "step": 2542 + }, + { + "epoch": 0.02543, + "grad_norm": 0.8594194387358546, + "learning_rate": 0.003, + "loss": 4.2551, + "step": 2543 + }, + { + "epoch": 0.02544, + "grad_norm": 0.7910126968684995, + "learning_rate": 0.003, + "loss": 4.244, + "step": 2544 + }, + { + "epoch": 0.02545, + "grad_norm": 0.6829853653468553, + "learning_rate": 0.003, + "loss": 4.216, + "step": 2545 + }, + { + "epoch": 0.02546, + "grad_norm": 0.7120189223135196, + "learning_rate": 0.003, + "loss": 4.2304, + "step": 2546 + }, + { + "epoch": 0.02547, + "grad_norm": 0.6042723068123944, + "learning_rate": 0.003, + "loss": 4.2606, + "step": 2547 + }, + { + "epoch": 0.02548, + "grad_norm": 0.6298594843243801, + "learning_rate": 0.003, + "loss": 4.2602, + "step": 2548 + }, + { + "epoch": 0.02549, + "grad_norm": 0.5999904649113489, + "learning_rate": 0.003, + "loss": 4.2469, + "step": 2549 + }, + { + "epoch": 0.0255, + "grad_norm": 0.6271074405015903, + "learning_rate": 0.003, + "loss": 4.2107, + "step": 2550 + }, + { + "epoch": 0.02551, + "grad_norm": 0.6199575999348788, + "learning_rate": 0.003, + "loss": 4.2396, + "step": 2551 + }, + { + "epoch": 0.02552, + "grad_norm": 0.6195762543397098, + "learning_rate": 0.003, + "loss": 4.2076, + "step": 2552 + }, + { + "epoch": 0.02553, + "grad_norm": 0.6695668966322667, + "learning_rate": 0.003, + "loss": 4.2746, + "step": 2553 + }, + { + "epoch": 0.02554, + "grad_norm": 0.7359151892369767, + "learning_rate": 0.003, + "loss": 4.2245, + "step": 2554 + }, + { + "epoch": 0.02555, + "grad_norm": 0.7444694590769985, + "learning_rate": 0.003, + "loss": 4.2187, + "step": 2555 + }, + { + "epoch": 0.02556, + "grad_norm": 0.5563951149157678, + "learning_rate": 0.003, + "loss": 4.2212, + "step": 2556 + }, + { + "epoch": 0.02557, + "grad_norm": 0.463326630335006, + "learning_rate": 0.003, + "loss": 4.2204, + "step": 2557 + }, + { + "epoch": 0.02558, + "grad_norm": 0.4775061885903974, + "learning_rate": 0.003, + "loss": 4.2352, + "step": 2558 + }, + { + "epoch": 0.02559, + "grad_norm": 0.45507178814749144, + "learning_rate": 0.003, + "loss": 4.229, + "step": 2559 + }, + { + "epoch": 0.0256, + "grad_norm": 0.44496507541292724, + "learning_rate": 0.003, + "loss": 4.202, + "step": 2560 + }, + { + "epoch": 0.02561, + "grad_norm": 0.3670426288233485, + "learning_rate": 0.003, + "loss": 4.1971, + "step": 2561 + }, + { + "epoch": 0.02562, + "grad_norm": 0.38316899243773206, + "learning_rate": 0.003, + "loss": 4.2137, + "step": 2562 + }, + { + "epoch": 0.02563, + "grad_norm": 0.3516799307048104, + "learning_rate": 0.003, + "loss": 4.1944, + "step": 2563 + }, + { + "epoch": 0.02564, + "grad_norm": 0.3439157566474706, + "learning_rate": 0.003, + "loss": 4.2139, + "step": 2564 + }, + { + "epoch": 0.02565, + "grad_norm": 0.3530781028336598, + "learning_rate": 0.003, + "loss": 4.1893, + "step": 2565 + }, + { + "epoch": 0.02566, + "grad_norm": 0.3535153102961858, + "learning_rate": 0.003, + "loss": 4.2125, + "step": 2566 + }, + { + "epoch": 0.02567, + "grad_norm": 0.41816993390160107, + "learning_rate": 0.003, + "loss": 4.2115, + "step": 2567 + }, + { + "epoch": 0.02568, + "grad_norm": 0.4566608866652098, + "learning_rate": 0.003, + "loss": 4.2058, + "step": 2568 + }, + { + "epoch": 0.02569, + "grad_norm": 0.5020275746642583, + "learning_rate": 0.003, + "loss": 4.1895, + "step": 2569 + }, + { + "epoch": 0.0257, + "grad_norm": 0.45491911821241204, + "learning_rate": 0.003, + "loss": 4.1923, + "step": 2570 + }, + { + "epoch": 0.02571, + "grad_norm": 0.4133535089644209, + "learning_rate": 0.003, + "loss": 4.2113, + "step": 2571 + }, + { + "epoch": 0.02572, + "grad_norm": 0.4042168971597752, + "learning_rate": 0.003, + "loss": 4.1874, + "step": 2572 + }, + { + "epoch": 0.02573, + "grad_norm": 0.3823197589786481, + "learning_rate": 0.003, + "loss": 4.1933, + "step": 2573 + }, + { + "epoch": 0.02574, + "grad_norm": 0.39218587765755697, + "learning_rate": 0.003, + "loss": 4.2238, + "step": 2574 + }, + { + "epoch": 0.02575, + "grad_norm": 0.3438035712553242, + "learning_rate": 0.003, + "loss": 4.1541, + "step": 2575 + }, + { + "epoch": 0.02576, + "grad_norm": 0.3842191419529875, + "learning_rate": 0.003, + "loss": 4.1626, + "step": 2576 + }, + { + "epoch": 0.02577, + "grad_norm": 0.42583222296628487, + "learning_rate": 0.003, + "loss": 4.1677, + "step": 2577 + }, + { + "epoch": 0.02578, + "grad_norm": 0.42273441184052274, + "learning_rate": 0.003, + "loss": 4.1902, + "step": 2578 + }, + { + "epoch": 0.02579, + "grad_norm": 0.46892691434613215, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 2579 + }, + { + "epoch": 0.0258, + "grad_norm": 0.5709292370942015, + "learning_rate": 0.003, + "loss": 4.199, + "step": 2580 + }, + { + "epoch": 0.02581, + "grad_norm": 0.7127132853972009, + "learning_rate": 0.003, + "loss": 4.2278, + "step": 2581 + }, + { + "epoch": 0.02582, + "grad_norm": 0.8754198005529624, + "learning_rate": 0.003, + "loss": 4.2358, + "step": 2582 + }, + { + "epoch": 0.02583, + "grad_norm": 0.8240175300819178, + "learning_rate": 0.003, + "loss": 4.2122, + "step": 2583 + }, + { + "epoch": 0.02584, + "grad_norm": 0.7842058343487789, + "learning_rate": 0.003, + "loss": 4.2267, + "step": 2584 + }, + { + "epoch": 0.02585, + "grad_norm": 0.8326334504065149, + "learning_rate": 0.003, + "loss": 4.2137, + "step": 2585 + }, + { + "epoch": 0.02586, + "grad_norm": 0.7754616837582553, + "learning_rate": 0.003, + "loss": 4.2322, + "step": 2586 + }, + { + "epoch": 0.02587, + "grad_norm": 0.6511240448731711, + "learning_rate": 0.003, + "loss": 4.2206, + "step": 2587 + }, + { + "epoch": 0.02588, + "grad_norm": 0.6418349994998516, + "learning_rate": 0.003, + "loss": 4.2155, + "step": 2588 + }, + { + "epoch": 0.02589, + "grad_norm": 0.6950697428484538, + "learning_rate": 0.003, + "loss": 4.2254, + "step": 2589 + }, + { + "epoch": 0.0259, + "grad_norm": 0.7453523656170742, + "learning_rate": 0.003, + "loss": 4.2359, + "step": 2590 + }, + { + "epoch": 0.02591, + "grad_norm": 0.6556652428839654, + "learning_rate": 0.003, + "loss": 4.1695, + "step": 2591 + }, + { + "epoch": 0.02592, + "grad_norm": 0.5716997523979154, + "learning_rate": 0.003, + "loss": 4.2113, + "step": 2592 + }, + { + "epoch": 0.02593, + "grad_norm": 0.5895740446059741, + "learning_rate": 0.003, + "loss": 4.2131, + "step": 2593 + }, + { + "epoch": 0.02594, + "grad_norm": 0.6861270884763415, + "learning_rate": 0.003, + "loss": 4.2385, + "step": 2594 + }, + { + "epoch": 0.02595, + "grad_norm": 0.6039685752573255, + "learning_rate": 0.003, + "loss": 4.1847, + "step": 2595 + }, + { + "epoch": 0.02596, + "grad_norm": 0.5737417856631055, + "learning_rate": 0.003, + "loss": 4.2245, + "step": 2596 + }, + { + "epoch": 0.02597, + "grad_norm": 0.5807882188149026, + "learning_rate": 0.003, + "loss": 4.1916, + "step": 2597 + }, + { + "epoch": 0.02598, + "grad_norm": 0.5478759076616263, + "learning_rate": 0.003, + "loss": 4.2296, + "step": 2598 + }, + { + "epoch": 0.02599, + "grad_norm": 0.5274069809990996, + "learning_rate": 0.003, + "loss": 4.2218, + "step": 2599 + }, + { + "epoch": 0.026, + "grad_norm": 0.5489291707939286, + "learning_rate": 0.003, + "loss": 4.2225, + "step": 2600 + }, + { + "epoch": 0.02601, + "grad_norm": 0.5075235418141223, + "learning_rate": 0.003, + "loss": 4.2266, + "step": 2601 + }, + { + "epoch": 0.02602, + "grad_norm": 0.5092543256680011, + "learning_rate": 0.003, + "loss": 4.1993, + "step": 2602 + }, + { + "epoch": 0.02603, + "grad_norm": 0.48420449669893695, + "learning_rate": 0.003, + "loss": 4.1912, + "step": 2603 + }, + { + "epoch": 0.02604, + "grad_norm": 0.4845552481378335, + "learning_rate": 0.003, + "loss": 4.181, + "step": 2604 + }, + { + "epoch": 0.02605, + "grad_norm": 0.522772142438526, + "learning_rate": 0.003, + "loss": 4.1993, + "step": 2605 + }, + { + "epoch": 0.02606, + "grad_norm": 0.49986531502998316, + "learning_rate": 0.003, + "loss": 4.2158, + "step": 2606 + }, + { + "epoch": 0.02607, + "grad_norm": 0.4992114205626333, + "learning_rate": 0.003, + "loss": 4.2071, + "step": 2607 + }, + { + "epoch": 0.02608, + "grad_norm": 0.516906776877324, + "learning_rate": 0.003, + "loss": 4.1893, + "step": 2608 + }, + { + "epoch": 0.02609, + "grad_norm": 0.5823377881437489, + "learning_rate": 0.003, + "loss": 4.2336, + "step": 2609 + }, + { + "epoch": 0.0261, + "grad_norm": 0.6432677415233223, + "learning_rate": 0.003, + "loss": 4.1868, + "step": 2610 + }, + { + "epoch": 0.02611, + "grad_norm": 0.6818797064218945, + "learning_rate": 0.003, + "loss": 4.1789, + "step": 2611 + }, + { + "epoch": 0.02612, + "grad_norm": 0.6622646028172395, + "learning_rate": 0.003, + "loss": 4.2188, + "step": 2612 + }, + { + "epoch": 0.02613, + "grad_norm": 0.6111717401030455, + "learning_rate": 0.003, + "loss": 4.2101, + "step": 2613 + }, + { + "epoch": 0.02614, + "grad_norm": 0.5350928499489108, + "learning_rate": 0.003, + "loss": 4.1863, + "step": 2614 + }, + { + "epoch": 0.02615, + "grad_norm": 0.4811589002895373, + "learning_rate": 0.003, + "loss": 4.1735, + "step": 2615 + }, + { + "epoch": 0.02616, + "grad_norm": 0.4870958333875311, + "learning_rate": 0.003, + "loss": 4.1868, + "step": 2616 + }, + { + "epoch": 0.02617, + "grad_norm": 0.48293626841319026, + "learning_rate": 0.003, + "loss": 4.1753, + "step": 2617 + }, + { + "epoch": 0.02618, + "grad_norm": 0.5217244185547227, + "learning_rate": 0.003, + "loss": 4.2041, + "step": 2618 + }, + { + "epoch": 0.02619, + "grad_norm": 0.5234547384625777, + "learning_rate": 0.003, + "loss": 4.2179, + "step": 2619 + }, + { + "epoch": 0.0262, + "grad_norm": 0.4734488648814552, + "learning_rate": 0.003, + "loss": 4.1769, + "step": 2620 + }, + { + "epoch": 0.02621, + "grad_norm": 0.4072710001568894, + "learning_rate": 0.003, + "loss": 4.2023, + "step": 2621 + }, + { + "epoch": 0.02622, + "grad_norm": 0.3670914624450667, + "learning_rate": 0.003, + "loss": 4.1997, + "step": 2622 + }, + { + "epoch": 0.02623, + "grad_norm": 0.3907209927814348, + "learning_rate": 0.003, + "loss": 4.2044, + "step": 2623 + }, + { + "epoch": 0.02624, + "grad_norm": 0.4112397678843122, + "learning_rate": 0.003, + "loss": 4.2395, + "step": 2624 + }, + { + "epoch": 0.02625, + "grad_norm": 0.4373318792363482, + "learning_rate": 0.003, + "loss": 4.2339, + "step": 2625 + }, + { + "epoch": 0.02626, + "grad_norm": 0.43765342876243357, + "learning_rate": 0.003, + "loss": 4.1682, + "step": 2626 + }, + { + "epoch": 0.02627, + "grad_norm": 0.44112134115596174, + "learning_rate": 0.003, + "loss": 4.1845, + "step": 2627 + }, + { + "epoch": 0.02628, + "grad_norm": 0.5820511792087758, + "learning_rate": 0.003, + "loss": 4.2196, + "step": 2628 + }, + { + "epoch": 0.02629, + "grad_norm": 0.6987525856490595, + "learning_rate": 0.003, + "loss": 4.1665, + "step": 2629 + }, + { + "epoch": 0.0263, + "grad_norm": 0.8227125293487279, + "learning_rate": 0.003, + "loss": 4.2044, + "step": 2630 + }, + { + "epoch": 0.02631, + "grad_norm": 0.8558111239802438, + "learning_rate": 0.003, + "loss": 4.2316, + "step": 2631 + }, + { + "epoch": 0.02632, + "grad_norm": 0.7416342423752187, + "learning_rate": 0.003, + "loss": 4.1891, + "step": 2632 + }, + { + "epoch": 0.02633, + "grad_norm": 0.7123350797241652, + "learning_rate": 0.003, + "loss": 4.2064, + "step": 2633 + }, + { + "epoch": 0.02634, + "grad_norm": 0.7492282422336083, + "learning_rate": 0.003, + "loss": 4.2291, + "step": 2634 + }, + { + "epoch": 0.02635, + "grad_norm": 0.9073437715289041, + "learning_rate": 0.003, + "loss": 4.2382, + "step": 2635 + }, + { + "epoch": 0.02636, + "grad_norm": 0.8986238889024204, + "learning_rate": 0.003, + "loss": 4.2298, + "step": 2636 + }, + { + "epoch": 0.02637, + "grad_norm": 0.803485832317335, + "learning_rate": 0.003, + "loss": 4.2558, + "step": 2637 + }, + { + "epoch": 0.02638, + "grad_norm": 0.790197716438678, + "learning_rate": 0.003, + "loss": 4.2252, + "step": 2638 + }, + { + "epoch": 0.02639, + "grad_norm": 0.6490246383904035, + "learning_rate": 0.003, + "loss": 4.2065, + "step": 2639 + }, + { + "epoch": 0.0264, + "grad_norm": 0.6271932479026731, + "learning_rate": 0.003, + "loss": 4.2032, + "step": 2640 + }, + { + "epoch": 0.02641, + "grad_norm": 0.5216470184091793, + "learning_rate": 0.003, + "loss": 4.212, + "step": 2641 + }, + { + "epoch": 0.02642, + "grad_norm": 0.5340137157215237, + "learning_rate": 0.003, + "loss": 4.2208, + "step": 2642 + }, + { + "epoch": 0.02643, + "grad_norm": 0.48600536528948673, + "learning_rate": 0.003, + "loss": 4.2039, + "step": 2643 + }, + { + "epoch": 0.02644, + "grad_norm": 0.4913843517241883, + "learning_rate": 0.003, + "loss": 4.2177, + "step": 2644 + }, + { + "epoch": 0.02645, + "grad_norm": 0.5723820549842618, + "learning_rate": 0.003, + "loss": 4.2298, + "step": 2645 + }, + { + "epoch": 0.02646, + "grad_norm": 0.6080093721456342, + "learning_rate": 0.003, + "loss": 4.1979, + "step": 2646 + }, + { + "epoch": 0.02647, + "grad_norm": 0.6222106205330706, + "learning_rate": 0.003, + "loss": 4.2212, + "step": 2647 + }, + { + "epoch": 0.02648, + "grad_norm": 0.5217769121416035, + "learning_rate": 0.003, + "loss": 4.2349, + "step": 2648 + }, + { + "epoch": 0.02649, + "grad_norm": 0.4227997429869736, + "learning_rate": 0.003, + "loss": 4.2183, + "step": 2649 + }, + { + "epoch": 0.0265, + "grad_norm": 0.47327414101860077, + "learning_rate": 0.003, + "loss": 4.178, + "step": 2650 + }, + { + "epoch": 0.02651, + "grad_norm": 0.5099068125596607, + "learning_rate": 0.003, + "loss": 4.1959, + "step": 2651 + }, + { + "epoch": 0.02652, + "grad_norm": 0.60276863782341, + "learning_rate": 0.003, + "loss": 4.2278, + "step": 2652 + }, + { + "epoch": 0.02653, + "grad_norm": 0.6740196425733591, + "learning_rate": 0.003, + "loss": 4.1869, + "step": 2653 + }, + { + "epoch": 0.02654, + "grad_norm": 0.6718723086436392, + "learning_rate": 0.003, + "loss": 4.2008, + "step": 2654 + }, + { + "epoch": 0.02655, + "grad_norm": 0.6985692134702927, + "learning_rate": 0.003, + "loss": 4.21, + "step": 2655 + }, + { + "epoch": 0.02656, + "grad_norm": 0.7105425653160966, + "learning_rate": 0.003, + "loss": 4.1838, + "step": 2656 + }, + { + "epoch": 0.02657, + "grad_norm": 0.6933600976197403, + "learning_rate": 0.003, + "loss": 4.2156, + "step": 2657 + }, + { + "epoch": 0.02658, + "grad_norm": 0.6544303516003507, + "learning_rate": 0.003, + "loss": 4.2197, + "step": 2658 + }, + { + "epoch": 0.02659, + "grad_norm": 0.5910001636604436, + "learning_rate": 0.003, + "loss": 4.2113, + "step": 2659 + }, + { + "epoch": 0.0266, + "grad_norm": 0.5163250162883284, + "learning_rate": 0.003, + "loss": 4.1915, + "step": 2660 + }, + { + "epoch": 0.02661, + "grad_norm": 0.5302434720729938, + "learning_rate": 0.003, + "loss": 4.2129, + "step": 2661 + }, + { + "epoch": 0.02662, + "grad_norm": 0.4750913186060552, + "learning_rate": 0.003, + "loss": 4.1991, + "step": 2662 + }, + { + "epoch": 0.02663, + "grad_norm": 0.4491722077606405, + "learning_rate": 0.003, + "loss": 4.1805, + "step": 2663 + }, + { + "epoch": 0.02664, + "grad_norm": 0.42258758049488826, + "learning_rate": 0.003, + "loss": 4.1803, + "step": 2664 + }, + { + "epoch": 0.02665, + "grad_norm": 0.41774121759742056, + "learning_rate": 0.003, + "loss": 4.2287, + "step": 2665 + }, + { + "epoch": 0.02666, + "grad_norm": 0.43325477990837064, + "learning_rate": 0.003, + "loss": 4.2184, + "step": 2666 + }, + { + "epoch": 0.02667, + "grad_norm": 0.4508456123093455, + "learning_rate": 0.003, + "loss": 4.1974, + "step": 2667 + }, + { + "epoch": 0.02668, + "grad_norm": 0.46262752082913233, + "learning_rate": 0.003, + "loss": 4.1674, + "step": 2668 + }, + { + "epoch": 0.02669, + "grad_norm": 0.42577584585471717, + "learning_rate": 0.003, + "loss": 4.1867, + "step": 2669 + }, + { + "epoch": 0.0267, + "grad_norm": 0.404720023333166, + "learning_rate": 0.003, + "loss": 4.1856, + "step": 2670 + }, + { + "epoch": 0.02671, + "grad_norm": 0.3942334538580407, + "learning_rate": 0.003, + "loss": 4.2006, + "step": 2671 + }, + { + "epoch": 0.02672, + "grad_norm": 0.40993974344347783, + "learning_rate": 0.003, + "loss": 4.1947, + "step": 2672 + }, + { + "epoch": 0.02673, + "grad_norm": 0.4616934932283336, + "learning_rate": 0.003, + "loss": 4.1918, + "step": 2673 + }, + { + "epoch": 0.02674, + "grad_norm": 0.5514329672225634, + "learning_rate": 0.003, + "loss": 4.1647, + "step": 2674 + }, + { + "epoch": 0.02675, + "grad_norm": 0.5466818103576505, + "learning_rate": 0.003, + "loss": 4.1643, + "step": 2675 + }, + { + "epoch": 0.02676, + "grad_norm": 0.6935187296359256, + "learning_rate": 0.003, + "loss": 4.1899, + "step": 2676 + }, + { + "epoch": 0.02677, + "grad_norm": 0.8598433315495629, + "learning_rate": 0.003, + "loss": 4.231, + "step": 2677 + }, + { + "epoch": 0.02678, + "grad_norm": 0.8883830728999605, + "learning_rate": 0.003, + "loss": 4.2344, + "step": 2678 + }, + { + "epoch": 0.02679, + "grad_norm": 0.6405381161924845, + "learning_rate": 0.003, + "loss": 4.1941, + "step": 2679 + }, + { + "epoch": 0.0268, + "grad_norm": 0.6915639860426047, + "learning_rate": 0.003, + "loss": 4.2139, + "step": 2680 + }, + { + "epoch": 0.02681, + "grad_norm": 0.5927165760939286, + "learning_rate": 0.003, + "loss": 4.2278, + "step": 2681 + }, + { + "epoch": 0.02682, + "grad_norm": 0.5745893208237367, + "learning_rate": 0.003, + "loss": 4.2044, + "step": 2682 + }, + { + "epoch": 0.02683, + "grad_norm": 0.5931786049326642, + "learning_rate": 0.003, + "loss": 4.2269, + "step": 2683 + }, + { + "epoch": 0.02684, + "grad_norm": 0.5656789702546796, + "learning_rate": 0.003, + "loss": 4.1996, + "step": 2684 + }, + { + "epoch": 0.02685, + "grad_norm": 0.6294556799351446, + "learning_rate": 0.003, + "loss": 4.211, + "step": 2685 + }, + { + "epoch": 0.02686, + "grad_norm": 0.5832268033312072, + "learning_rate": 0.003, + "loss": 4.1905, + "step": 2686 + }, + { + "epoch": 0.02687, + "grad_norm": 0.570290226028237, + "learning_rate": 0.003, + "loss": 4.1996, + "step": 2687 + }, + { + "epoch": 0.02688, + "grad_norm": 0.5455818208368779, + "learning_rate": 0.003, + "loss": 4.1744, + "step": 2688 + }, + { + "epoch": 0.02689, + "grad_norm": 0.5599812876010125, + "learning_rate": 0.003, + "loss": 4.2088, + "step": 2689 + }, + { + "epoch": 0.0269, + "grad_norm": 0.5474516018001153, + "learning_rate": 0.003, + "loss": 4.2129, + "step": 2690 + }, + { + "epoch": 0.02691, + "grad_norm": 0.5566067071138427, + "learning_rate": 0.003, + "loss": 4.2043, + "step": 2691 + }, + { + "epoch": 0.02692, + "grad_norm": 0.526509280614571, + "learning_rate": 0.003, + "loss": 4.2054, + "step": 2692 + }, + { + "epoch": 0.02693, + "grad_norm": 0.4827516603970495, + "learning_rate": 0.003, + "loss": 4.218, + "step": 2693 + }, + { + "epoch": 0.02694, + "grad_norm": 0.46801137768639334, + "learning_rate": 0.003, + "loss": 4.1783, + "step": 2694 + }, + { + "epoch": 0.02695, + "grad_norm": 0.4878539547841592, + "learning_rate": 0.003, + "loss": 4.1935, + "step": 2695 + }, + { + "epoch": 0.02696, + "grad_norm": 0.47854515525549607, + "learning_rate": 0.003, + "loss": 4.1948, + "step": 2696 + }, + { + "epoch": 0.02697, + "grad_norm": 0.5065481594378214, + "learning_rate": 0.003, + "loss": 4.2065, + "step": 2697 + }, + { + "epoch": 0.02698, + "grad_norm": 0.5248263427239105, + "learning_rate": 0.003, + "loss": 4.1898, + "step": 2698 + }, + { + "epoch": 0.02699, + "grad_norm": 0.5277451425908661, + "learning_rate": 0.003, + "loss": 4.2105, + "step": 2699 + }, + { + "epoch": 0.027, + "grad_norm": 0.5538062675395776, + "learning_rate": 0.003, + "loss": 4.1806, + "step": 2700 + }, + { + "epoch": 0.02701, + "grad_norm": 0.6185989339314542, + "learning_rate": 0.003, + "loss": 4.173, + "step": 2701 + }, + { + "epoch": 0.02702, + "grad_norm": 0.5874971422385314, + "learning_rate": 0.003, + "loss": 4.225, + "step": 2702 + }, + { + "epoch": 0.02703, + "grad_norm": 0.5463427926178012, + "learning_rate": 0.003, + "loss": 4.2202, + "step": 2703 + }, + { + "epoch": 0.02704, + "grad_norm": 0.5672786289651166, + "learning_rate": 0.003, + "loss": 4.1641, + "step": 2704 + }, + { + "epoch": 0.02705, + "grad_norm": 0.6256411921927717, + "learning_rate": 0.003, + "loss": 4.2194, + "step": 2705 + }, + { + "epoch": 0.02706, + "grad_norm": 0.652684226752671, + "learning_rate": 0.003, + "loss": 4.1841, + "step": 2706 + }, + { + "epoch": 0.02707, + "grad_norm": 0.6029115328911401, + "learning_rate": 0.003, + "loss": 4.1842, + "step": 2707 + }, + { + "epoch": 0.02708, + "grad_norm": 0.5287197350990817, + "learning_rate": 0.003, + "loss": 4.2087, + "step": 2708 + }, + { + "epoch": 0.02709, + "grad_norm": 0.6492025140814462, + "learning_rate": 0.003, + "loss": 4.1853, + "step": 2709 + }, + { + "epoch": 0.0271, + "grad_norm": 0.6051042816144401, + "learning_rate": 0.003, + "loss": 4.1894, + "step": 2710 + }, + { + "epoch": 0.02711, + "grad_norm": 0.5747478778857192, + "learning_rate": 0.003, + "loss": 4.1754, + "step": 2711 + }, + { + "epoch": 0.02712, + "grad_norm": 0.5615755138550911, + "learning_rate": 0.003, + "loss": 4.1868, + "step": 2712 + }, + { + "epoch": 0.02713, + "grad_norm": 0.5892906666649981, + "learning_rate": 0.003, + "loss": 4.1834, + "step": 2713 + }, + { + "epoch": 0.02714, + "grad_norm": 0.6013481891384199, + "learning_rate": 0.003, + "loss": 4.1837, + "step": 2714 + }, + { + "epoch": 0.02715, + "grad_norm": 0.6656155771795913, + "learning_rate": 0.003, + "loss": 4.1807, + "step": 2715 + }, + { + "epoch": 0.02716, + "grad_norm": 0.8931240767996229, + "learning_rate": 0.003, + "loss": 4.2057, + "step": 2716 + }, + { + "epoch": 0.02717, + "grad_norm": 0.9885029624018516, + "learning_rate": 0.003, + "loss": 4.2196, + "step": 2717 + }, + { + "epoch": 0.02718, + "grad_norm": 0.8754295759055237, + "learning_rate": 0.003, + "loss": 4.2405, + "step": 2718 + }, + { + "epoch": 0.02719, + "grad_norm": 0.7774225786110023, + "learning_rate": 0.003, + "loss": 4.2179, + "step": 2719 + }, + { + "epoch": 0.0272, + "grad_norm": 0.6491659668608393, + "learning_rate": 0.003, + "loss": 4.2007, + "step": 2720 + }, + { + "epoch": 0.02721, + "grad_norm": 0.6343965078219608, + "learning_rate": 0.003, + "loss": 4.2145, + "step": 2721 + }, + { + "epoch": 0.02722, + "grad_norm": 0.6251153466031798, + "learning_rate": 0.003, + "loss": 4.2101, + "step": 2722 + }, + { + "epoch": 0.02723, + "grad_norm": 0.6748255813851252, + "learning_rate": 0.003, + "loss": 4.2215, + "step": 2723 + }, + { + "epoch": 0.02724, + "grad_norm": 0.6864377875600585, + "learning_rate": 0.003, + "loss": 4.2222, + "step": 2724 + }, + { + "epoch": 0.02725, + "grad_norm": 0.657540480175467, + "learning_rate": 0.003, + "loss": 4.2162, + "step": 2725 + }, + { + "epoch": 0.02726, + "grad_norm": 0.67217283621503, + "learning_rate": 0.003, + "loss": 4.1914, + "step": 2726 + }, + { + "epoch": 0.02727, + "grad_norm": 0.7144546093311819, + "learning_rate": 0.003, + "loss": 4.2253, + "step": 2727 + }, + { + "epoch": 0.02728, + "grad_norm": 0.6685628291749064, + "learning_rate": 0.003, + "loss": 4.2062, + "step": 2728 + }, + { + "epoch": 0.02729, + "grad_norm": 0.6958710627910398, + "learning_rate": 0.003, + "loss": 4.2431, + "step": 2729 + }, + { + "epoch": 0.0273, + "grad_norm": 0.7029911290820989, + "learning_rate": 0.003, + "loss": 4.2118, + "step": 2730 + }, + { + "epoch": 0.02731, + "grad_norm": 0.7388699066611659, + "learning_rate": 0.003, + "loss": 4.2131, + "step": 2731 + }, + { + "epoch": 0.02732, + "grad_norm": 0.6457150813790012, + "learning_rate": 0.003, + "loss": 4.2055, + "step": 2732 + }, + { + "epoch": 0.02733, + "grad_norm": 0.42245973261823044, + "learning_rate": 0.003, + "loss": 4.225, + "step": 2733 + }, + { + "epoch": 0.02734, + "grad_norm": 0.4568223524843039, + "learning_rate": 0.003, + "loss": 4.1769, + "step": 2734 + }, + { + "epoch": 0.02735, + "grad_norm": 0.46161311769792607, + "learning_rate": 0.003, + "loss": 4.2213, + "step": 2735 + }, + { + "epoch": 0.02736, + "grad_norm": 0.43828909949858114, + "learning_rate": 0.003, + "loss": 4.1921, + "step": 2736 + }, + { + "epoch": 0.02737, + "grad_norm": 0.44381003049579976, + "learning_rate": 0.003, + "loss": 4.1831, + "step": 2737 + }, + { + "epoch": 0.02738, + "grad_norm": 0.40770566309449235, + "learning_rate": 0.003, + "loss": 4.207, + "step": 2738 + }, + { + "epoch": 0.02739, + "grad_norm": 0.3980139287869944, + "learning_rate": 0.003, + "loss": 4.1947, + "step": 2739 + }, + { + "epoch": 0.0274, + "grad_norm": 0.4103938581926708, + "learning_rate": 0.003, + "loss": 4.1853, + "step": 2740 + }, + { + "epoch": 0.02741, + "grad_norm": 0.3977644284371821, + "learning_rate": 0.003, + "loss": 4.1971, + "step": 2741 + }, + { + "epoch": 0.02742, + "grad_norm": 0.3910236660359437, + "learning_rate": 0.003, + "loss": 4.1829, + "step": 2742 + }, + { + "epoch": 0.02743, + "grad_norm": 0.37057712717675134, + "learning_rate": 0.003, + "loss": 4.2028, + "step": 2743 + }, + { + "epoch": 0.02744, + "grad_norm": 0.39171990335728823, + "learning_rate": 0.003, + "loss": 4.1862, + "step": 2744 + }, + { + "epoch": 0.02745, + "grad_norm": 0.3931440185763024, + "learning_rate": 0.003, + "loss": 4.1978, + "step": 2745 + }, + { + "epoch": 0.02746, + "grad_norm": 0.4493443882352147, + "learning_rate": 0.003, + "loss": 4.1722, + "step": 2746 + }, + { + "epoch": 0.02747, + "grad_norm": 0.5239427386961047, + "learning_rate": 0.003, + "loss": 4.2084, + "step": 2747 + }, + { + "epoch": 0.02748, + "grad_norm": 0.48560097013750286, + "learning_rate": 0.003, + "loss": 4.16, + "step": 2748 + }, + { + "epoch": 0.02749, + "grad_norm": 0.4388250470896872, + "learning_rate": 0.003, + "loss": 4.1614, + "step": 2749 + }, + { + "epoch": 0.0275, + "grad_norm": 0.425974848346712, + "learning_rate": 0.003, + "loss": 4.1976, + "step": 2750 + }, + { + "epoch": 0.02751, + "grad_norm": 0.4487191491618812, + "learning_rate": 0.003, + "loss": 4.2102, + "step": 2751 + }, + { + "epoch": 0.02752, + "grad_norm": 0.5262082176933003, + "learning_rate": 0.003, + "loss": 4.1993, + "step": 2752 + }, + { + "epoch": 0.02753, + "grad_norm": 0.5624164107816553, + "learning_rate": 0.003, + "loss": 4.1806, + "step": 2753 + }, + { + "epoch": 0.02754, + "grad_norm": 0.6290188699871871, + "learning_rate": 0.003, + "loss": 4.1674, + "step": 2754 + }, + { + "epoch": 0.02755, + "grad_norm": 0.6718718115523771, + "learning_rate": 0.003, + "loss": 4.1724, + "step": 2755 + }, + { + "epoch": 0.02756, + "grad_norm": 0.689725298181599, + "learning_rate": 0.003, + "loss": 4.2042, + "step": 2756 + }, + { + "epoch": 0.02757, + "grad_norm": 0.7456661709777471, + "learning_rate": 0.003, + "loss": 4.2182, + "step": 2757 + }, + { + "epoch": 0.02758, + "grad_norm": 0.7434755613308037, + "learning_rate": 0.003, + "loss": 4.175, + "step": 2758 + }, + { + "epoch": 0.02759, + "grad_norm": 0.6285033589958848, + "learning_rate": 0.003, + "loss": 4.1982, + "step": 2759 + }, + { + "epoch": 0.0276, + "grad_norm": 0.6775730198112929, + "learning_rate": 0.003, + "loss": 4.2094, + "step": 2760 + }, + { + "epoch": 0.02761, + "grad_norm": 0.6910341516486844, + "learning_rate": 0.003, + "loss": 4.2268, + "step": 2761 + }, + { + "epoch": 0.02762, + "grad_norm": 0.6650852460123947, + "learning_rate": 0.003, + "loss": 4.1938, + "step": 2762 + }, + { + "epoch": 0.02763, + "grad_norm": 0.6011271334026552, + "learning_rate": 0.003, + "loss": 4.2029, + "step": 2763 + }, + { + "epoch": 0.02764, + "grad_norm": 0.5886973411048619, + "learning_rate": 0.003, + "loss": 4.1962, + "step": 2764 + }, + { + "epoch": 0.02765, + "grad_norm": 0.6682064585302908, + "learning_rate": 0.003, + "loss": 4.2039, + "step": 2765 + }, + { + "epoch": 0.02766, + "grad_norm": 0.7004868212188018, + "learning_rate": 0.003, + "loss": 4.1783, + "step": 2766 + }, + { + "epoch": 0.02767, + "grad_norm": 0.6391373162549133, + "learning_rate": 0.003, + "loss": 4.2317, + "step": 2767 + }, + { + "epoch": 0.02768, + "grad_norm": 0.5711886608834656, + "learning_rate": 0.003, + "loss": 4.1613, + "step": 2768 + }, + { + "epoch": 0.02769, + "grad_norm": 0.6028604983542873, + "learning_rate": 0.003, + "loss": 4.2212, + "step": 2769 + }, + { + "epoch": 0.0277, + "grad_norm": 0.5836613470106289, + "learning_rate": 0.003, + "loss": 4.1882, + "step": 2770 + }, + { + "epoch": 0.02771, + "grad_norm": 0.5619610082417591, + "learning_rate": 0.003, + "loss": 4.174, + "step": 2771 + }, + { + "epoch": 0.02772, + "grad_norm": 0.55339038708748, + "learning_rate": 0.003, + "loss": 4.1688, + "step": 2772 + }, + { + "epoch": 0.02773, + "grad_norm": 0.5034136342234373, + "learning_rate": 0.003, + "loss": 4.2003, + "step": 2773 + }, + { + "epoch": 0.02774, + "grad_norm": 0.5193760267646912, + "learning_rate": 0.003, + "loss": 4.2225, + "step": 2774 + }, + { + "epoch": 0.02775, + "grad_norm": 0.5211288768219952, + "learning_rate": 0.003, + "loss": 4.1899, + "step": 2775 + }, + { + "epoch": 0.02776, + "grad_norm": 0.4784203907496932, + "learning_rate": 0.003, + "loss": 4.1844, + "step": 2776 + }, + { + "epoch": 0.02777, + "grad_norm": 0.48277039376268643, + "learning_rate": 0.003, + "loss": 4.1625, + "step": 2777 + }, + { + "epoch": 0.02778, + "grad_norm": 0.5520225977650862, + "learning_rate": 0.003, + "loss": 4.2068, + "step": 2778 + }, + { + "epoch": 0.02779, + "grad_norm": 0.676790078006391, + "learning_rate": 0.003, + "loss": 4.1825, + "step": 2779 + }, + { + "epoch": 0.0278, + "grad_norm": 0.7848726112221234, + "learning_rate": 0.003, + "loss": 4.1832, + "step": 2780 + }, + { + "epoch": 0.02781, + "grad_norm": 0.7347482785122091, + "learning_rate": 0.003, + "loss": 4.1673, + "step": 2781 + }, + { + "epoch": 0.02782, + "grad_norm": 0.6054417831566177, + "learning_rate": 0.003, + "loss": 4.1891, + "step": 2782 + }, + { + "epoch": 0.02783, + "grad_norm": 0.5849343545068603, + "learning_rate": 0.003, + "loss": 4.1925, + "step": 2783 + }, + { + "epoch": 0.02784, + "grad_norm": 0.6041156935596753, + "learning_rate": 0.003, + "loss": 4.1877, + "step": 2784 + }, + { + "epoch": 0.02785, + "grad_norm": 0.595867835436581, + "learning_rate": 0.003, + "loss": 4.2062, + "step": 2785 + }, + { + "epoch": 0.02786, + "grad_norm": 0.6928930555141227, + "learning_rate": 0.003, + "loss": 4.2052, + "step": 2786 + }, + { + "epoch": 0.02787, + "grad_norm": 0.7406479460311315, + "learning_rate": 0.003, + "loss": 4.2115, + "step": 2787 + }, + { + "epoch": 0.02788, + "grad_norm": 0.6940976950244777, + "learning_rate": 0.003, + "loss": 4.1982, + "step": 2788 + }, + { + "epoch": 0.02789, + "grad_norm": 0.6273541949163579, + "learning_rate": 0.003, + "loss": 4.2011, + "step": 2789 + }, + { + "epoch": 0.0279, + "grad_norm": 0.5979480136249415, + "learning_rate": 0.003, + "loss": 4.1745, + "step": 2790 + }, + { + "epoch": 0.02791, + "grad_norm": 0.5594196614238619, + "learning_rate": 0.003, + "loss": 4.1864, + "step": 2791 + }, + { + "epoch": 0.02792, + "grad_norm": 0.5206581117733323, + "learning_rate": 0.003, + "loss": 4.2009, + "step": 2792 + }, + { + "epoch": 0.02793, + "grad_norm": 0.5275957045486279, + "learning_rate": 0.003, + "loss": 4.1994, + "step": 2793 + }, + { + "epoch": 0.02794, + "grad_norm": 0.524155250681782, + "learning_rate": 0.003, + "loss": 4.194, + "step": 2794 + }, + { + "epoch": 0.02795, + "grad_norm": 0.5721629122704371, + "learning_rate": 0.003, + "loss": 4.2082, + "step": 2795 + }, + { + "epoch": 0.02796, + "grad_norm": 0.6009605504824468, + "learning_rate": 0.003, + "loss": 4.2021, + "step": 2796 + }, + { + "epoch": 0.02797, + "grad_norm": 0.5762272128929422, + "learning_rate": 0.003, + "loss": 4.2131, + "step": 2797 + }, + { + "epoch": 0.02798, + "grad_norm": 0.5678013400030233, + "learning_rate": 0.003, + "loss": 4.2087, + "step": 2798 + }, + { + "epoch": 0.02799, + "grad_norm": 0.5466966091962687, + "learning_rate": 0.003, + "loss": 4.1921, + "step": 2799 + }, + { + "epoch": 0.028, + "grad_norm": 0.5426896151230969, + "learning_rate": 0.003, + "loss": 4.2039, + "step": 2800 + }, + { + "epoch": 0.02801, + "grad_norm": 0.5309488273760117, + "learning_rate": 0.003, + "loss": 4.1799, + "step": 2801 + }, + { + "epoch": 0.02802, + "grad_norm": 0.5742220089169363, + "learning_rate": 0.003, + "loss": 4.1824, + "step": 2802 + }, + { + "epoch": 0.02803, + "grad_norm": 0.7893252753789815, + "learning_rate": 0.003, + "loss": 4.2101, + "step": 2803 + }, + { + "epoch": 0.02804, + "grad_norm": 1.052487841789281, + "learning_rate": 0.003, + "loss": 4.2001, + "step": 2804 + }, + { + "epoch": 0.02805, + "grad_norm": 0.8655535125529751, + "learning_rate": 0.003, + "loss": 4.2346, + "step": 2805 + }, + { + "epoch": 0.02806, + "grad_norm": 0.6475434652146125, + "learning_rate": 0.003, + "loss": 4.202, + "step": 2806 + }, + { + "epoch": 0.02807, + "grad_norm": 0.5991110542030375, + "learning_rate": 0.003, + "loss": 4.2062, + "step": 2807 + }, + { + "epoch": 0.02808, + "grad_norm": 0.5007983686795169, + "learning_rate": 0.003, + "loss": 4.1818, + "step": 2808 + }, + { + "epoch": 0.02809, + "grad_norm": 0.49095808154292164, + "learning_rate": 0.003, + "loss": 4.2217, + "step": 2809 + }, + { + "epoch": 0.0281, + "grad_norm": 0.4819118161545627, + "learning_rate": 0.003, + "loss": 4.1742, + "step": 2810 + }, + { + "epoch": 0.02811, + "grad_norm": 0.44009677007510295, + "learning_rate": 0.003, + "loss": 4.1812, + "step": 2811 + }, + { + "epoch": 0.02812, + "grad_norm": 0.4080215487522985, + "learning_rate": 0.003, + "loss": 4.1953, + "step": 2812 + }, + { + "epoch": 0.02813, + "grad_norm": 0.3937509668583135, + "learning_rate": 0.003, + "loss": 4.1839, + "step": 2813 + }, + { + "epoch": 0.02814, + "grad_norm": 0.34050782014520087, + "learning_rate": 0.003, + "loss": 4.1814, + "step": 2814 + }, + { + "epoch": 0.02815, + "grad_norm": 0.32268087418962427, + "learning_rate": 0.003, + "loss": 4.2181, + "step": 2815 + }, + { + "epoch": 0.02816, + "grad_norm": 0.3255642062500674, + "learning_rate": 0.003, + "loss": 4.1944, + "step": 2816 + }, + { + "epoch": 0.02817, + "grad_norm": 0.3381409922347926, + "learning_rate": 0.003, + "loss": 4.1508, + "step": 2817 + }, + { + "epoch": 0.02818, + "grad_norm": 0.38168796766705765, + "learning_rate": 0.003, + "loss": 4.1629, + "step": 2818 + }, + { + "epoch": 0.02819, + "grad_norm": 0.4325061454583649, + "learning_rate": 0.003, + "loss": 4.1913, + "step": 2819 + }, + { + "epoch": 0.0282, + "grad_norm": 0.4532493695937539, + "learning_rate": 0.003, + "loss": 4.1852, + "step": 2820 + }, + { + "epoch": 0.02821, + "grad_norm": 0.47151197099161557, + "learning_rate": 0.003, + "loss": 4.2029, + "step": 2821 + }, + { + "epoch": 0.02822, + "grad_norm": 0.550967405378687, + "learning_rate": 0.003, + "loss": 4.1762, + "step": 2822 + }, + { + "epoch": 0.02823, + "grad_norm": 0.5888233523434347, + "learning_rate": 0.003, + "loss": 4.2039, + "step": 2823 + }, + { + "epoch": 0.02824, + "grad_norm": 0.5616734570991271, + "learning_rate": 0.003, + "loss": 4.1907, + "step": 2824 + }, + { + "epoch": 0.02825, + "grad_norm": 0.4685382550291958, + "learning_rate": 0.003, + "loss": 4.1898, + "step": 2825 + }, + { + "epoch": 0.02826, + "grad_norm": 0.3984745923734355, + "learning_rate": 0.003, + "loss": 4.1666, + "step": 2826 + }, + { + "epoch": 0.02827, + "grad_norm": 0.4466687377830192, + "learning_rate": 0.003, + "loss": 4.153, + "step": 2827 + }, + { + "epoch": 0.02828, + "grad_norm": 0.48090906385724086, + "learning_rate": 0.003, + "loss": 4.1939, + "step": 2828 + }, + { + "epoch": 0.02829, + "grad_norm": 0.4887994839877233, + "learning_rate": 0.003, + "loss": 4.2162, + "step": 2829 + }, + { + "epoch": 0.0283, + "grad_norm": 0.5659608141025803, + "learning_rate": 0.003, + "loss": 4.1838, + "step": 2830 + }, + { + "epoch": 0.02831, + "grad_norm": 0.6332182296297022, + "learning_rate": 0.003, + "loss": 4.1886, + "step": 2831 + }, + { + "epoch": 0.02832, + "grad_norm": 0.7157897208149597, + "learning_rate": 0.003, + "loss": 4.2049, + "step": 2832 + }, + { + "epoch": 0.02833, + "grad_norm": 0.7488210186844626, + "learning_rate": 0.003, + "loss": 4.1858, + "step": 2833 + }, + { + "epoch": 0.02834, + "grad_norm": 0.721287579139196, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 2834 + }, + { + "epoch": 0.02835, + "grad_norm": 0.7107105075536025, + "learning_rate": 0.003, + "loss": 4.1873, + "step": 2835 + }, + { + "epoch": 0.02836, + "grad_norm": 0.7041301874694449, + "learning_rate": 0.003, + "loss": 4.2059, + "step": 2836 + }, + { + "epoch": 0.02837, + "grad_norm": 0.7504397142936561, + "learning_rate": 0.003, + "loss": 4.182, + "step": 2837 + }, + { + "epoch": 0.02838, + "grad_norm": 0.7162503617742563, + "learning_rate": 0.003, + "loss": 4.1502, + "step": 2838 + }, + { + "epoch": 0.02839, + "grad_norm": 0.6629058755892886, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 2839 + }, + { + "epoch": 0.0284, + "grad_norm": 0.8045463625357235, + "learning_rate": 0.003, + "loss": 4.1743, + "step": 2840 + }, + { + "epoch": 0.02841, + "grad_norm": 0.8128195799944198, + "learning_rate": 0.003, + "loss": 4.2167, + "step": 2841 + }, + { + "epoch": 0.02842, + "grad_norm": 0.9021793870071328, + "learning_rate": 0.003, + "loss": 4.201, + "step": 2842 + }, + { + "epoch": 0.02843, + "grad_norm": 0.9569629998728558, + "learning_rate": 0.003, + "loss": 4.2417, + "step": 2843 + }, + { + "epoch": 0.02844, + "grad_norm": 0.9483564800535025, + "learning_rate": 0.003, + "loss": 4.2425, + "step": 2844 + }, + { + "epoch": 0.02845, + "grad_norm": 0.9913358728662397, + "learning_rate": 0.003, + "loss": 4.2312, + "step": 2845 + }, + { + "epoch": 0.02846, + "grad_norm": 1.063054241841927, + "learning_rate": 0.003, + "loss": 4.2341, + "step": 2846 + }, + { + "epoch": 0.02847, + "grad_norm": 1.0645706257254368, + "learning_rate": 0.003, + "loss": 4.2982, + "step": 2847 + }, + { + "epoch": 0.02848, + "grad_norm": 0.8386178601836739, + "learning_rate": 0.003, + "loss": 4.248, + "step": 2848 + }, + { + "epoch": 0.02849, + "grad_norm": 0.764216257611563, + "learning_rate": 0.003, + "loss": 4.2302, + "step": 2849 + }, + { + "epoch": 0.0285, + "grad_norm": 0.8296984070312703, + "learning_rate": 0.003, + "loss": 4.2406, + "step": 2850 + }, + { + "epoch": 0.02851, + "grad_norm": 0.7660626969686088, + "learning_rate": 0.003, + "loss": 4.2727, + "step": 2851 + }, + { + "epoch": 0.02852, + "grad_norm": 0.6699896951076655, + "learning_rate": 0.003, + "loss": 4.2432, + "step": 2852 + }, + { + "epoch": 0.02853, + "grad_norm": 0.6448384261298653, + "learning_rate": 0.003, + "loss": 4.2061, + "step": 2853 + }, + { + "epoch": 0.02854, + "grad_norm": 0.5738318430229328, + "learning_rate": 0.003, + "loss": 4.2452, + "step": 2854 + }, + { + "epoch": 0.02855, + "grad_norm": 0.5220513862086724, + "learning_rate": 0.003, + "loss": 4.238, + "step": 2855 + }, + { + "epoch": 0.02856, + "grad_norm": 0.487730279931273, + "learning_rate": 0.003, + "loss": 4.236, + "step": 2856 + }, + { + "epoch": 0.02857, + "grad_norm": 0.5431060757677301, + "learning_rate": 0.003, + "loss": 4.2166, + "step": 2857 + }, + { + "epoch": 0.02858, + "grad_norm": 0.5661595941123131, + "learning_rate": 0.003, + "loss": 4.204, + "step": 2858 + }, + { + "epoch": 0.02859, + "grad_norm": 0.4484066606331403, + "learning_rate": 0.003, + "loss": 4.2049, + "step": 2859 + }, + { + "epoch": 0.0286, + "grad_norm": 0.4075396606562088, + "learning_rate": 0.003, + "loss": 4.2145, + "step": 2860 + }, + { + "epoch": 0.02861, + "grad_norm": 0.38814897890981537, + "learning_rate": 0.003, + "loss": 4.1834, + "step": 2861 + }, + { + "epoch": 0.02862, + "grad_norm": 0.35763606720502106, + "learning_rate": 0.003, + "loss": 4.1992, + "step": 2862 + }, + { + "epoch": 0.02863, + "grad_norm": 0.33574165762339114, + "learning_rate": 0.003, + "loss": 4.2202, + "step": 2863 + }, + { + "epoch": 0.02864, + "grad_norm": 0.2931133637710789, + "learning_rate": 0.003, + "loss": 4.1911, + "step": 2864 + }, + { + "epoch": 0.02865, + "grad_norm": 0.2747295563994342, + "learning_rate": 0.003, + "loss": 4.2198, + "step": 2865 + }, + { + "epoch": 0.02866, + "grad_norm": 0.2878577462186846, + "learning_rate": 0.003, + "loss": 4.1997, + "step": 2866 + }, + { + "epoch": 0.02867, + "grad_norm": 0.30967589778912374, + "learning_rate": 0.003, + "loss": 4.1832, + "step": 2867 + }, + { + "epoch": 0.02868, + "grad_norm": 0.3317442746388715, + "learning_rate": 0.003, + "loss": 4.1921, + "step": 2868 + }, + { + "epoch": 0.02869, + "grad_norm": 0.41983761578290535, + "learning_rate": 0.003, + "loss": 4.1814, + "step": 2869 + }, + { + "epoch": 0.0287, + "grad_norm": 0.5221052091755417, + "learning_rate": 0.003, + "loss": 4.2048, + "step": 2870 + }, + { + "epoch": 0.02871, + "grad_norm": 0.6596505309045855, + "learning_rate": 0.003, + "loss": 4.1929, + "step": 2871 + }, + { + "epoch": 0.02872, + "grad_norm": 0.6939156668954594, + "learning_rate": 0.003, + "loss": 4.2319, + "step": 2872 + }, + { + "epoch": 0.02873, + "grad_norm": 0.5889408095632228, + "learning_rate": 0.003, + "loss": 4.1969, + "step": 2873 + }, + { + "epoch": 0.02874, + "grad_norm": 0.5945812975211832, + "learning_rate": 0.003, + "loss": 4.2106, + "step": 2874 + }, + { + "epoch": 0.02875, + "grad_norm": 0.5997681923067583, + "learning_rate": 0.003, + "loss": 4.2173, + "step": 2875 + }, + { + "epoch": 0.02876, + "grad_norm": 0.5432684898721573, + "learning_rate": 0.003, + "loss": 4.1947, + "step": 2876 + }, + { + "epoch": 0.02877, + "grad_norm": 0.5211950136895529, + "learning_rate": 0.003, + "loss": 4.1769, + "step": 2877 + }, + { + "epoch": 0.02878, + "grad_norm": 0.4787079571485456, + "learning_rate": 0.003, + "loss": 4.1717, + "step": 2878 + }, + { + "epoch": 0.02879, + "grad_norm": 0.45005415921350717, + "learning_rate": 0.003, + "loss": 4.1629, + "step": 2879 + }, + { + "epoch": 0.0288, + "grad_norm": 0.43431149375473005, + "learning_rate": 0.003, + "loss": 4.1688, + "step": 2880 + }, + { + "epoch": 0.02881, + "grad_norm": 0.4696763789342238, + "learning_rate": 0.003, + "loss": 4.1826, + "step": 2881 + }, + { + "epoch": 0.02882, + "grad_norm": 0.5425923911480585, + "learning_rate": 0.003, + "loss": 4.2011, + "step": 2882 + }, + { + "epoch": 0.02883, + "grad_norm": 0.4752679190892369, + "learning_rate": 0.003, + "loss": 4.1811, + "step": 2883 + }, + { + "epoch": 0.02884, + "grad_norm": 0.4357972258734796, + "learning_rate": 0.003, + "loss": 4.2167, + "step": 2884 + }, + { + "epoch": 0.02885, + "grad_norm": 0.42135010007795737, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 2885 + }, + { + "epoch": 0.02886, + "grad_norm": 0.4753116215584479, + "learning_rate": 0.003, + "loss": 4.1904, + "step": 2886 + }, + { + "epoch": 0.02887, + "grad_norm": 0.4764421561290465, + "learning_rate": 0.003, + "loss": 4.1753, + "step": 2887 + }, + { + "epoch": 0.02888, + "grad_norm": 0.4467467351995427, + "learning_rate": 0.003, + "loss": 4.1716, + "step": 2888 + }, + { + "epoch": 0.02889, + "grad_norm": 0.4870006343939259, + "learning_rate": 0.003, + "loss": 4.1625, + "step": 2889 + }, + { + "epoch": 0.0289, + "grad_norm": 0.4870936179251758, + "learning_rate": 0.003, + "loss": 4.1939, + "step": 2890 + }, + { + "epoch": 0.02891, + "grad_norm": 0.5804488605460525, + "learning_rate": 0.003, + "loss": 4.2122, + "step": 2891 + }, + { + "epoch": 0.02892, + "grad_norm": 0.6634466821898878, + "learning_rate": 0.003, + "loss": 4.17, + "step": 2892 + }, + { + "epoch": 0.02893, + "grad_norm": 0.5948043705095636, + "learning_rate": 0.003, + "loss": 4.1771, + "step": 2893 + }, + { + "epoch": 0.02894, + "grad_norm": 0.6043920015751632, + "learning_rate": 0.003, + "loss": 4.1843, + "step": 2894 + }, + { + "epoch": 0.02895, + "grad_norm": 0.6201059818349552, + "learning_rate": 0.003, + "loss": 4.2268, + "step": 2895 + }, + { + "epoch": 0.02896, + "grad_norm": 0.6991042225102296, + "learning_rate": 0.003, + "loss": 4.1857, + "step": 2896 + }, + { + "epoch": 0.02897, + "grad_norm": 0.7490225053396202, + "learning_rate": 0.003, + "loss": 4.1703, + "step": 2897 + }, + { + "epoch": 0.02898, + "grad_norm": 0.7349838612699012, + "learning_rate": 0.003, + "loss": 4.2192, + "step": 2898 + }, + { + "epoch": 0.02899, + "grad_norm": 0.7005801384832842, + "learning_rate": 0.003, + "loss": 4.2014, + "step": 2899 + }, + { + "epoch": 0.029, + "grad_norm": 0.6982394096022596, + "learning_rate": 0.003, + "loss": 4.1924, + "step": 2900 + }, + { + "epoch": 0.02901, + "grad_norm": 0.7179332370923166, + "learning_rate": 0.003, + "loss": 4.2092, + "step": 2901 + }, + { + "epoch": 0.02902, + "grad_norm": 0.6408206050082779, + "learning_rate": 0.003, + "loss": 4.2244, + "step": 2902 + }, + { + "epoch": 0.02903, + "grad_norm": 0.5794248721426231, + "learning_rate": 0.003, + "loss": 4.1817, + "step": 2903 + }, + { + "epoch": 0.02904, + "grad_norm": 0.6047919514195621, + "learning_rate": 0.003, + "loss": 4.1991, + "step": 2904 + }, + { + "epoch": 0.02905, + "grad_norm": 0.6712321741283004, + "learning_rate": 0.003, + "loss": 4.2179, + "step": 2905 + }, + { + "epoch": 0.02906, + "grad_norm": 0.7364218920572956, + "learning_rate": 0.003, + "loss": 4.2038, + "step": 2906 + }, + { + "epoch": 0.02907, + "grad_norm": 0.7800724882255463, + "learning_rate": 0.003, + "loss": 4.1811, + "step": 2907 + }, + { + "epoch": 0.02908, + "grad_norm": 0.7350720938021648, + "learning_rate": 0.003, + "loss": 4.2049, + "step": 2908 + }, + { + "epoch": 0.02909, + "grad_norm": 0.6300086850554201, + "learning_rate": 0.003, + "loss": 4.1942, + "step": 2909 + }, + { + "epoch": 0.0291, + "grad_norm": 0.5419084839810532, + "learning_rate": 0.003, + "loss": 4.203, + "step": 2910 + }, + { + "epoch": 0.02911, + "grad_norm": 0.544140250655527, + "learning_rate": 0.003, + "loss": 4.1974, + "step": 2911 + }, + { + "epoch": 0.02912, + "grad_norm": 0.5832074121091988, + "learning_rate": 0.003, + "loss": 4.1707, + "step": 2912 + }, + { + "epoch": 0.02913, + "grad_norm": 0.46996544227828957, + "learning_rate": 0.003, + "loss": 4.1972, + "step": 2913 + }, + { + "epoch": 0.02914, + "grad_norm": 0.5185065785536244, + "learning_rate": 0.003, + "loss": 4.1876, + "step": 2914 + }, + { + "epoch": 0.02915, + "grad_norm": 0.5161041915363022, + "learning_rate": 0.003, + "loss": 4.1792, + "step": 2915 + }, + { + "epoch": 0.02916, + "grad_norm": 0.5743632962294265, + "learning_rate": 0.003, + "loss": 4.1875, + "step": 2916 + }, + { + "epoch": 0.02917, + "grad_norm": 0.5787314482733064, + "learning_rate": 0.003, + "loss": 4.2184, + "step": 2917 + }, + { + "epoch": 0.02918, + "grad_norm": 0.6089620541454349, + "learning_rate": 0.003, + "loss": 4.1759, + "step": 2918 + }, + { + "epoch": 0.02919, + "grad_norm": 0.5982379349158571, + "learning_rate": 0.003, + "loss": 4.219, + "step": 2919 + }, + { + "epoch": 0.0292, + "grad_norm": 0.5540097529270788, + "learning_rate": 0.003, + "loss": 4.156, + "step": 2920 + }, + { + "epoch": 0.02921, + "grad_norm": 0.5487736094530016, + "learning_rate": 0.003, + "loss": 4.1965, + "step": 2921 + }, + { + "epoch": 0.02922, + "grad_norm": 0.46820953528164894, + "learning_rate": 0.003, + "loss": 4.1944, + "step": 2922 + }, + { + "epoch": 0.02923, + "grad_norm": 0.3911917353150244, + "learning_rate": 0.003, + "loss": 4.1613, + "step": 2923 + }, + { + "epoch": 0.02924, + "grad_norm": 0.3519948559374971, + "learning_rate": 0.003, + "loss": 4.1948, + "step": 2924 + }, + { + "epoch": 0.02925, + "grad_norm": 0.37937109757353865, + "learning_rate": 0.003, + "loss": 4.1979, + "step": 2925 + }, + { + "epoch": 0.02926, + "grad_norm": 0.4065842767583339, + "learning_rate": 0.003, + "loss": 4.1756, + "step": 2926 + }, + { + "epoch": 0.02927, + "grad_norm": 0.475304983845517, + "learning_rate": 0.003, + "loss": 4.1825, + "step": 2927 + }, + { + "epoch": 0.02928, + "grad_norm": 0.6734308844569649, + "learning_rate": 0.003, + "loss": 4.2095, + "step": 2928 + }, + { + "epoch": 0.02929, + "grad_norm": 0.8546919556478062, + "learning_rate": 0.003, + "loss": 4.2048, + "step": 2929 + }, + { + "epoch": 0.0293, + "grad_norm": 0.8348963936929714, + "learning_rate": 0.003, + "loss": 4.2035, + "step": 2930 + }, + { + "epoch": 0.02931, + "grad_norm": 0.7454052973981365, + "learning_rate": 0.003, + "loss": 4.2011, + "step": 2931 + }, + { + "epoch": 0.02932, + "grad_norm": 0.7467998755597431, + "learning_rate": 0.003, + "loss": 4.1986, + "step": 2932 + }, + { + "epoch": 0.02933, + "grad_norm": 0.8069014916887333, + "learning_rate": 0.003, + "loss": 4.2246, + "step": 2933 + }, + { + "epoch": 0.02934, + "grad_norm": 0.9491281331586824, + "learning_rate": 0.003, + "loss": 4.2274, + "step": 2934 + }, + { + "epoch": 0.02935, + "grad_norm": 0.9112675124388164, + "learning_rate": 0.003, + "loss": 4.222, + "step": 2935 + }, + { + "epoch": 0.02936, + "grad_norm": 0.8519647588347719, + "learning_rate": 0.003, + "loss": 4.2389, + "step": 2936 + }, + { + "epoch": 0.02937, + "grad_norm": 0.9555587969650123, + "learning_rate": 0.003, + "loss": 4.2301, + "step": 2937 + }, + { + "epoch": 0.02938, + "grad_norm": 0.9150311365393384, + "learning_rate": 0.003, + "loss": 4.2464, + "step": 2938 + }, + { + "epoch": 0.02939, + "grad_norm": 1.0737440013454487, + "learning_rate": 0.003, + "loss": 4.2526, + "step": 2939 + }, + { + "epoch": 0.0294, + "grad_norm": 1.0147261563008552, + "learning_rate": 0.003, + "loss": 4.2801, + "step": 2940 + }, + { + "epoch": 0.02941, + "grad_norm": 1.022661673211658, + "learning_rate": 0.003, + "loss": 4.2879, + "step": 2941 + }, + { + "epoch": 0.02942, + "grad_norm": 1.0444970801430784, + "learning_rate": 0.003, + "loss": 4.2748, + "step": 2942 + }, + { + "epoch": 0.02943, + "grad_norm": 0.8455394657906586, + "learning_rate": 0.003, + "loss": 4.2533, + "step": 2943 + }, + { + "epoch": 0.02944, + "grad_norm": 0.7628339588396233, + "learning_rate": 0.003, + "loss": 4.2739, + "step": 2944 + }, + { + "epoch": 0.02945, + "grad_norm": 0.9388418061065686, + "learning_rate": 0.003, + "loss": 4.2636, + "step": 2945 + }, + { + "epoch": 0.02946, + "grad_norm": 0.9857547654390597, + "learning_rate": 0.003, + "loss": 4.2709, + "step": 2946 + }, + { + "epoch": 0.02947, + "grad_norm": 0.9078404718745537, + "learning_rate": 0.003, + "loss": 4.301, + "step": 2947 + }, + { + "epoch": 0.02948, + "grad_norm": 0.9520758925981907, + "learning_rate": 0.003, + "loss": 4.2798, + "step": 2948 + }, + { + "epoch": 0.02949, + "grad_norm": 1.0406544574219045, + "learning_rate": 0.003, + "loss": 4.2581, + "step": 2949 + }, + { + "epoch": 0.0295, + "grad_norm": 0.9292248813352704, + "learning_rate": 0.003, + "loss": 4.2783, + "step": 2950 + }, + { + "epoch": 0.02951, + "grad_norm": 0.8590093083304559, + "learning_rate": 0.003, + "loss": 4.2285, + "step": 2951 + }, + { + "epoch": 0.02952, + "grad_norm": 0.6554290928742069, + "learning_rate": 0.003, + "loss": 4.2469, + "step": 2952 + }, + { + "epoch": 0.02953, + "grad_norm": 0.8266033050494169, + "learning_rate": 0.003, + "loss": 4.2603, + "step": 2953 + }, + { + "epoch": 0.02954, + "grad_norm": 0.9710367944205619, + "learning_rate": 0.003, + "loss": 4.3127, + "step": 2954 + }, + { + "epoch": 0.02955, + "grad_norm": 0.9091757097495087, + "learning_rate": 0.003, + "loss": 4.2852, + "step": 2955 + }, + { + "epoch": 0.02956, + "grad_norm": 0.7400776132762851, + "learning_rate": 0.003, + "loss": 4.2656, + "step": 2956 + }, + { + "epoch": 0.02957, + "grad_norm": 0.5166853898536266, + "learning_rate": 0.003, + "loss": 4.2638, + "step": 2957 + }, + { + "epoch": 0.02958, + "grad_norm": 0.49710477556743804, + "learning_rate": 0.003, + "loss": 4.276, + "step": 2958 + }, + { + "epoch": 0.02959, + "grad_norm": 0.4499644234512779, + "learning_rate": 0.003, + "loss": 4.2565, + "step": 2959 + }, + { + "epoch": 0.0296, + "grad_norm": 0.4150995513488321, + "learning_rate": 0.003, + "loss": 4.2574, + "step": 2960 + }, + { + "epoch": 0.02961, + "grad_norm": 0.4170220343004301, + "learning_rate": 0.003, + "loss": 4.2257, + "step": 2961 + }, + { + "epoch": 0.02962, + "grad_norm": 0.4582297929571511, + "learning_rate": 0.003, + "loss": 4.2309, + "step": 2962 + }, + { + "epoch": 0.02963, + "grad_norm": 0.4661209765546486, + "learning_rate": 0.003, + "loss": 4.2018, + "step": 2963 + }, + { + "epoch": 0.02964, + "grad_norm": 0.40893029477200127, + "learning_rate": 0.003, + "loss": 4.1831, + "step": 2964 + }, + { + "epoch": 0.02965, + "grad_norm": 0.30952309242863746, + "learning_rate": 0.003, + "loss": 4.2259, + "step": 2965 + }, + { + "epoch": 0.02966, + "grad_norm": 0.2925470804974261, + "learning_rate": 0.003, + "loss": 4.2331, + "step": 2966 + }, + { + "epoch": 0.02967, + "grad_norm": 0.30877812791584447, + "learning_rate": 0.003, + "loss": 4.2188, + "step": 2967 + }, + { + "epoch": 0.02968, + "grad_norm": 0.2982831000211414, + "learning_rate": 0.003, + "loss": 4.2128, + "step": 2968 + }, + { + "epoch": 0.02969, + "grad_norm": 0.31941995962945985, + "learning_rate": 0.003, + "loss": 4.207, + "step": 2969 + }, + { + "epoch": 0.0297, + "grad_norm": 0.4198384435174913, + "learning_rate": 0.003, + "loss": 4.1928, + "step": 2970 + }, + { + "epoch": 0.02971, + "grad_norm": 0.4895528117346824, + "learning_rate": 0.003, + "loss": 4.2113, + "step": 2971 + }, + { + "epoch": 0.02972, + "grad_norm": 0.46403453609324874, + "learning_rate": 0.003, + "loss": 4.1791, + "step": 2972 + }, + { + "epoch": 0.02973, + "grad_norm": 0.3996362985623369, + "learning_rate": 0.003, + "loss": 4.1612, + "step": 2973 + }, + { + "epoch": 0.02974, + "grad_norm": 0.3669370591415718, + "learning_rate": 0.003, + "loss": 4.197, + "step": 2974 + }, + { + "epoch": 0.02975, + "grad_norm": 0.3727347214399743, + "learning_rate": 0.003, + "loss": 4.1805, + "step": 2975 + }, + { + "epoch": 0.02976, + "grad_norm": 0.40978542004123153, + "learning_rate": 0.003, + "loss": 4.1911, + "step": 2976 + }, + { + "epoch": 0.02977, + "grad_norm": 0.4390067885032759, + "learning_rate": 0.003, + "loss": 4.178, + "step": 2977 + }, + { + "epoch": 0.02978, + "grad_norm": 0.38270215912996663, + "learning_rate": 0.003, + "loss": 4.1974, + "step": 2978 + }, + { + "epoch": 0.02979, + "grad_norm": 0.30944659309862094, + "learning_rate": 0.003, + "loss": 4.1765, + "step": 2979 + }, + { + "epoch": 0.0298, + "grad_norm": 0.3395231398632022, + "learning_rate": 0.003, + "loss": 4.181, + "step": 2980 + }, + { + "epoch": 0.02981, + "grad_norm": 0.34592707909470244, + "learning_rate": 0.003, + "loss": 4.1907, + "step": 2981 + }, + { + "epoch": 0.02982, + "grad_norm": 0.38035052902890665, + "learning_rate": 0.003, + "loss": 4.1864, + "step": 2982 + }, + { + "epoch": 0.02983, + "grad_norm": 0.5707754810875516, + "learning_rate": 0.003, + "loss": 4.2039, + "step": 2983 + }, + { + "epoch": 0.02984, + "grad_norm": 0.8429776036729895, + "learning_rate": 0.003, + "loss": 4.1934, + "step": 2984 + }, + { + "epoch": 0.02985, + "grad_norm": 0.9426647233611747, + "learning_rate": 0.003, + "loss": 4.2337, + "step": 2985 + }, + { + "epoch": 0.02986, + "grad_norm": 0.5829174432301835, + "learning_rate": 0.003, + "loss": 4.1829, + "step": 2986 + }, + { + "epoch": 0.02987, + "grad_norm": 0.5797297658595812, + "learning_rate": 0.003, + "loss": 4.1761, + "step": 2987 + }, + { + "epoch": 0.02988, + "grad_norm": 0.5667463117618501, + "learning_rate": 0.003, + "loss": 4.1884, + "step": 2988 + }, + { + "epoch": 0.02989, + "grad_norm": 0.46700979940535653, + "learning_rate": 0.003, + "loss": 4.196, + "step": 2989 + }, + { + "epoch": 0.0299, + "grad_norm": 0.47425843632840237, + "learning_rate": 0.003, + "loss": 4.1545, + "step": 2990 + }, + { + "epoch": 0.02991, + "grad_norm": 0.4585869479501072, + "learning_rate": 0.003, + "loss": 4.2064, + "step": 2991 + }, + { + "epoch": 0.02992, + "grad_norm": 0.44091690071775774, + "learning_rate": 0.003, + "loss": 4.2006, + "step": 2992 + }, + { + "epoch": 0.02993, + "grad_norm": 0.4427631334764928, + "learning_rate": 0.003, + "loss": 4.1703, + "step": 2993 + }, + { + "epoch": 0.02994, + "grad_norm": 0.3680333066458911, + "learning_rate": 0.003, + "loss": 4.1784, + "step": 2994 + }, + { + "epoch": 0.02995, + "grad_norm": 0.3313732790736943, + "learning_rate": 0.003, + "loss": 4.213, + "step": 2995 + }, + { + "epoch": 0.02996, + "grad_norm": 0.33322944772224145, + "learning_rate": 0.003, + "loss": 4.1998, + "step": 2996 + }, + { + "epoch": 0.02997, + "grad_norm": 0.35383706371010143, + "learning_rate": 0.003, + "loss": 4.1619, + "step": 2997 + }, + { + "epoch": 0.02998, + "grad_norm": 0.3144018568097704, + "learning_rate": 0.003, + "loss": 4.1605, + "step": 2998 + }, + { + "epoch": 0.02999, + "grad_norm": 0.2707448134611539, + "learning_rate": 0.003, + "loss": 4.1532, + "step": 2999 + }, + { + "epoch": 0.03, + "grad_norm": 0.31453822473769166, + "learning_rate": 0.003, + "loss": 4.1985, + "step": 3000 + }, + { + "epoch": 0.03001, + "grad_norm": 0.3575380630906315, + "learning_rate": 0.003, + "loss": 4.1657, + "step": 3001 + }, + { + "epoch": 0.03002, + "grad_norm": 0.37987123508738174, + "learning_rate": 0.003, + "loss": 4.1801, + "step": 3002 + }, + { + "epoch": 0.03003, + "grad_norm": 0.43883399019288905, + "learning_rate": 0.003, + "loss": 4.1596, + "step": 3003 + }, + { + "epoch": 0.03004, + "grad_norm": 0.4354037002724127, + "learning_rate": 0.003, + "loss": 4.1389, + "step": 3004 + }, + { + "epoch": 0.03005, + "grad_norm": 0.44090216643634306, + "learning_rate": 0.003, + "loss": 4.1901, + "step": 3005 + }, + { + "epoch": 0.03006, + "grad_norm": 0.4541062050925091, + "learning_rate": 0.003, + "loss": 4.1847, + "step": 3006 + }, + { + "epoch": 0.03007, + "grad_norm": 0.5413252745038201, + "learning_rate": 0.003, + "loss": 4.1589, + "step": 3007 + }, + { + "epoch": 0.03008, + "grad_norm": 0.5914463578547696, + "learning_rate": 0.003, + "loss": 4.1893, + "step": 3008 + }, + { + "epoch": 0.03009, + "grad_norm": 0.6057328540964105, + "learning_rate": 0.003, + "loss": 4.1629, + "step": 3009 + }, + { + "epoch": 0.0301, + "grad_norm": 0.6785032290357721, + "learning_rate": 0.003, + "loss": 4.1916, + "step": 3010 + }, + { + "epoch": 0.03011, + "grad_norm": 0.7337865546330415, + "learning_rate": 0.003, + "loss": 4.1764, + "step": 3011 + }, + { + "epoch": 0.03012, + "grad_norm": 0.7469304524271401, + "learning_rate": 0.003, + "loss": 4.1847, + "step": 3012 + }, + { + "epoch": 0.03013, + "grad_norm": 0.6647601453471846, + "learning_rate": 0.003, + "loss": 4.1443, + "step": 3013 + }, + { + "epoch": 0.03014, + "grad_norm": 0.6507126425794799, + "learning_rate": 0.003, + "loss": 4.1729, + "step": 3014 + }, + { + "epoch": 0.03015, + "grad_norm": 0.6380413738814859, + "learning_rate": 0.003, + "loss": 4.1745, + "step": 3015 + }, + { + "epoch": 0.03016, + "grad_norm": 0.6182015477535148, + "learning_rate": 0.003, + "loss": 4.1815, + "step": 3016 + }, + { + "epoch": 0.03017, + "grad_norm": 0.586737080148615, + "learning_rate": 0.003, + "loss": 4.1768, + "step": 3017 + }, + { + "epoch": 0.03018, + "grad_norm": 0.5605400072157327, + "learning_rate": 0.003, + "loss": 4.1704, + "step": 3018 + }, + { + "epoch": 0.03019, + "grad_norm": 0.5243331941511797, + "learning_rate": 0.003, + "loss": 4.1891, + "step": 3019 + }, + { + "epoch": 0.0302, + "grad_norm": 0.49926755386002525, + "learning_rate": 0.003, + "loss": 4.1784, + "step": 3020 + }, + { + "epoch": 0.03021, + "grad_norm": 0.5365217273161322, + "learning_rate": 0.003, + "loss": 4.2017, + "step": 3021 + }, + { + "epoch": 0.03022, + "grad_norm": 0.49962846601806954, + "learning_rate": 0.003, + "loss": 4.1994, + "step": 3022 + }, + { + "epoch": 0.03023, + "grad_norm": 0.44101225502614944, + "learning_rate": 0.003, + "loss": 4.2091, + "step": 3023 + }, + { + "epoch": 0.03024, + "grad_norm": 0.4478587533564886, + "learning_rate": 0.003, + "loss": 4.2023, + "step": 3024 + }, + { + "epoch": 0.03025, + "grad_norm": 0.36914153729322, + "learning_rate": 0.003, + "loss": 4.1809, + "step": 3025 + }, + { + "epoch": 0.03026, + "grad_norm": 0.4189775215567249, + "learning_rate": 0.003, + "loss": 4.1546, + "step": 3026 + }, + { + "epoch": 0.03027, + "grad_norm": 0.4812644816912879, + "learning_rate": 0.003, + "loss": 4.1781, + "step": 3027 + }, + { + "epoch": 0.03028, + "grad_norm": 0.5896483265711634, + "learning_rate": 0.003, + "loss": 4.1412, + "step": 3028 + }, + { + "epoch": 0.03029, + "grad_norm": 0.7727132136494914, + "learning_rate": 0.003, + "loss": 4.1946, + "step": 3029 + }, + { + "epoch": 0.0303, + "grad_norm": 0.8225314550231486, + "learning_rate": 0.003, + "loss": 4.184, + "step": 3030 + }, + { + "epoch": 0.03031, + "grad_norm": 0.6946644336533113, + "learning_rate": 0.003, + "loss": 4.1598, + "step": 3031 + }, + { + "epoch": 0.03032, + "grad_norm": 0.7204147056292812, + "learning_rate": 0.003, + "loss": 4.192, + "step": 3032 + }, + { + "epoch": 0.03033, + "grad_norm": 0.6929276241309884, + "learning_rate": 0.003, + "loss": 4.1876, + "step": 3033 + }, + { + "epoch": 0.03034, + "grad_norm": 0.6811610794872539, + "learning_rate": 0.003, + "loss": 4.1933, + "step": 3034 + }, + { + "epoch": 0.03035, + "grad_norm": 0.6796984084846434, + "learning_rate": 0.003, + "loss": 4.2232, + "step": 3035 + }, + { + "epoch": 0.03036, + "grad_norm": 0.6389991482634575, + "learning_rate": 0.003, + "loss": 4.1949, + "step": 3036 + }, + { + "epoch": 0.03037, + "grad_norm": 0.7107042944049744, + "learning_rate": 0.003, + "loss": 4.1819, + "step": 3037 + }, + { + "epoch": 0.03038, + "grad_norm": 0.7261414033877567, + "learning_rate": 0.003, + "loss": 4.1853, + "step": 3038 + }, + { + "epoch": 0.03039, + "grad_norm": 0.6927833025504285, + "learning_rate": 0.003, + "loss": 4.1894, + "step": 3039 + }, + { + "epoch": 0.0304, + "grad_norm": 0.6355129937975011, + "learning_rate": 0.003, + "loss": 4.1938, + "step": 3040 + }, + { + "epoch": 0.03041, + "grad_norm": 0.5305188998243404, + "learning_rate": 0.003, + "loss": 4.1953, + "step": 3041 + }, + { + "epoch": 0.03042, + "grad_norm": 0.5277486902643708, + "learning_rate": 0.003, + "loss": 4.2187, + "step": 3042 + }, + { + "epoch": 0.03043, + "grad_norm": 0.48072198596584337, + "learning_rate": 0.003, + "loss": 4.1691, + "step": 3043 + }, + { + "epoch": 0.03044, + "grad_norm": 0.5305619681037871, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 3044 + }, + { + "epoch": 0.03045, + "grad_norm": 0.6019249816686142, + "learning_rate": 0.003, + "loss": 4.1781, + "step": 3045 + }, + { + "epoch": 0.03046, + "grad_norm": 0.6501665014846743, + "learning_rate": 0.003, + "loss": 4.1924, + "step": 3046 + }, + { + "epoch": 0.03047, + "grad_norm": 0.6512554720210705, + "learning_rate": 0.003, + "loss": 4.1981, + "step": 3047 + }, + { + "epoch": 0.03048, + "grad_norm": 0.6687647551352641, + "learning_rate": 0.003, + "loss": 4.1799, + "step": 3048 + }, + { + "epoch": 0.03049, + "grad_norm": 0.6495592187211907, + "learning_rate": 0.003, + "loss": 4.1692, + "step": 3049 + }, + { + "epoch": 0.0305, + "grad_norm": 0.5742671707682592, + "learning_rate": 0.003, + "loss": 4.1811, + "step": 3050 + }, + { + "epoch": 0.03051, + "grad_norm": 0.5413644439530771, + "learning_rate": 0.003, + "loss": 4.1904, + "step": 3051 + }, + { + "epoch": 0.03052, + "grad_norm": 0.45841707001486987, + "learning_rate": 0.003, + "loss": 4.1921, + "step": 3052 + }, + { + "epoch": 0.03053, + "grad_norm": 0.41417961125519964, + "learning_rate": 0.003, + "loss": 4.193, + "step": 3053 + }, + { + "epoch": 0.03054, + "grad_norm": 0.43823010289384967, + "learning_rate": 0.003, + "loss": 4.1743, + "step": 3054 + }, + { + "epoch": 0.03055, + "grad_norm": 0.5090646995969303, + "learning_rate": 0.003, + "loss": 4.1764, + "step": 3055 + }, + { + "epoch": 0.03056, + "grad_norm": 0.5373409114504033, + "learning_rate": 0.003, + "loss": 4.1895, + "step": 3056 + }, + { + "epoch": 0.03057, + "grad_norm": 0.6080154895830046, + "learning_rate": 0.003, + "loss": 4.1889, + "step": 3057 + }, + { + "epoch": 0.03058, + "grad_norm": 0.6540368964023406, + "learning_rate": 0.003, + "loss": 4.1648, + "step": 3058 + }, + { + "epoch": 0.03059, + "grad_norm": 0.5788424826468096, + "learning_rate": 0.003, + "loss": 4.1748, + "step": 3059 + }, + { + "epoch": 0.0306, + "grad_norm": 0.4988555874907532, + "learning_rate": 0.003, + "loss": 4.1584, + "step": 3060 + }, + { + "epoch": 0.03061, + "grad_norm": 0.4878251974397899, + "learning_rate": 0.003, + "loss": 4.1841, + "step": 3061 + }, + { + "epoch": 0.03062, + "grad_norm": 0.5122227250314405, + "learning_rate": 0.003, + "loss": 4.2073, + "step": 3062 + }, + { + "epoch": 0.03063, + "grad_norm": 0.48008913969979206, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 3063 + }, + { + "epoch": 0.03064, + "grad_norm": 0.46426896895718006, + "learning_rate": 0.003, + "loss": 4.1818, + "step": 3064 + }, + { + "epoch": 0.03065, + "grad_norm": 0.4213766940672057, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 3065 + }, + { + "epoch": 0.03066, + "grad_norm": 0.3680402555018932, + "learning_rate": 0.003, + "loss": 4.1908, + "step": 3066 + }, + { + "epoch": 0.03067, + "grad_norm": 0.3709473284252339, + "learning_rate": 0.003, + "loss": 4.194, + "step": 3067 + }, + { + "epoch": 0.03068, + "grad_norm": 0.3892404694228725, + "learning_rate": 0.003, + "loss": 4.1654, + "step": 3068 + }, + { + "epoch": 0.03069, + "grad_norm": 0.421709346396654, + "learning_rate": 0.003, + "loss": 4.1505, + "step": 3069 + }, + { + "epoch": 0.0307, + "grad_norm": 0.452536722715443, + "learning_rate": 0.003, + "loss": 4.1628, + "step": 3070 + }, + { + "epoch": 0.03071, + "grad_norm": 0.43180254995245465, + "learning_rate": 0.003, + "loss": 4.1592, + "step": 3071 + }, + { + "epoch": 0.03072, + "grad_norm": 0.4894410650771712, + "learning_rate": 0.003, + "loss": 4.167, + "step": 3072 + }, + { + "epoch": 0.03073, + "grad_norm": 0.5403720255329184, + "learning_rate": 0.003, + "loss": 4.1662, + "step": 3073 + }, + { + "epoch": 0.03074, + "grad_norm": 0.5846698892510205, + "learning_rate": 0.003, + "loss": 4.1709, + "step": 3074 + }, + { + "epoch": 0.03075, + "grad_norm": 0.6574319904083324, + "learning_rate": 0.003, + "loss": 4.1977, + "step": 3075 + }, + { + "epoch": 0.03076, + "grad_norm": 0.6619917640445645, + "learning_rate": 0.003, + "loss": 4.1936, + "step": 3076 + }, + { + "epoch": 0.03077, + "grad_norm": 0.6043736206778582, + "learning_rate": 0.003, + "loss": 4.1867, + "step": 3077 + }, + { + "epoch": 0.03078, + "grad_norm": 0.6204154957439805, + "learning_rate": 0.003, + "loss": 4.1729, + "step": 3078 + }, + { + "epoch": 0.03079, + "grad_norm": 0.5593470356338103, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 3079 + }, + { + "epoch": 0.0308, + "grad_norm": 0.5726380712603409, + "learning_rate": 0.003, + "loss": 4.1884, + "step": 3080 + }, + { + "epoch": 0.03081, + "grad_norm": 0.6460072493113082, + "learning_rate": 0.003, + "loss": 4.1673, + "step": 3081 + }, + { + "epoch": 0.03082, + "grad_norm": 0.6213962184440697, + "learning_rate": 0.003, + "loss": 4.1696, + "step": 3082 + }, + { + "epoch": 0.03083, + "grad_norm": 0.7067435150796765, + "learning_rate": 0.003, + "loss": 4.1723, + "step": 3083 + }, + { + "epoch": 0.03084, + "grad_norm": 0.7686911194186954, + "learning_rate": 0.003, + "loss": 4.1952, + "step": 3084 + }, + { + "epoch": 0.03085, + "grad_norm": 0.7715187358287607, + "learning_rate": 0.003, + "loss": 4.193, + "step": 3085 + }, + { + "epoch": 0.03086, + "grad_norm": 0.8253824375938966, + "learning_rate": 0.003, + "loss": 4.2186, + "step": 3086 + }, + { + "epoch": 0.03087, + "grad_norm": 0.7572823291901524, + "learning_rate": 0.003, + "loss": 4.1941, + "step": 3087 + }, + { + "epoch": 0.03088, + "grad_norm": 0.6756116771892228, + "learning_rate": 0.003, + "loss": 4.209, + "step": 3088 + }, + { + "epoch": 0.03089, + "grad_norm": 0.6820329410441291, + "learning_rate": 0.003, + "loss": 4.1733, + "step": 3089 + }, + { + "epoch": 0.0309, + "grad_norm": 0.7401927831575114, + "learning_rate": 0.003, + "loss": 4.1803, + "step": 3090 + }, + { + "epoch": 0.03091, + "grad_norm": 0.6394165992316918, + "learning_rate": 0.003, + "loss": 4.1389, + "step": 3091 + }, + { + "epoch": 0.03092, + "grad_norm": 0.6572468374220727, + "learning_rate": 0.003, + "loss": 4.1725, + "step": 3092 + }, + { + "epoch": 0.03093, + "grad_norm": 0.6935160519017697, + "learning_rate": 0.003, + "loss": 4.204, + "step": 3093 + }, + { + "epoch": 0.03094, + "grad_norm": 0.6716436178909779, + "learning_rate": 0.003, + "loss": 4.188, + "step": 3094 + }, + { + "epoch": 0.03095, + "grad_norm": 0.6591966230230418, + "learning_rate": 0.003, + "loss": 4.203, + "step": 3095 + }, + { + "epoch": 0.03096, + "grad_norm": 0.5959190368353036, + "learning_rate": 0.003, + "loss": 4.1776, + "step": 3096 + }, + { + "epoch": 0.03097, + "grad_norm": 0.5866701878437197, + "learning_rate": 0.003, + "loss": 4.1994, + "step": 3097 + }, + { + "epoch": 0.03098, + "grad_norm": 0.6431690827463121, + "learning_rate": 0.003, + "loss": 4.1981, + "step": 3098 + }, + { + "epoch": 0.03099, + "grad_norm": 0.6249835235277311, + "learning_rate": 0.003, + "loss": 4.1902, + "step": 3099 + }, + { + "epoch": 0.031, + "grad_norm": 0.6337483657376243, + "learning_rate": 0.003, + "loss": 4.1855, + "step": 3100 + }, + { + "epoch": 0.03101, + "grad_norm": 0.6496496012737066, + "learning_rate": 0.003, + "loss": 4.1777, + "step": 3101 + }, + { + "epoch": 0.03102, + "grad_norm": 0.6229765708037603, + "learning_rate": 0.003, + "loss": 4.1786, + "step": 3102 + }, + { + "epoch": 0.03103, + "grad_norm": 0.6156795563561019, + "learning_rate": 0.003, + "loss": 4.1669, + "step": 3103 + }, + { + "epoch": 0.03104, + "grad_norm": 0.5787370390357155, + "learning_rate": 0.003, + "loss": 4.1717, + "step": 3104 + }, + { + "epoch": 0.03105, + "grad_norm": 0.46609179090900144, + "learning_rate": 0.003, + "loss": 4.1469, + "step": 3105 + }, + { + "epoch": 0.03106, + "grad_norm": 0.39336258398622215, + "learning_rate": 0.003, + "loss": 4.1631, + "step": 3106 + }, + { + "epoch": 0.03107, + "grad_norm": 0.4151072460907146, + "learning_rate": 0.003, + "loss": 4.1432, + "step": 3107 + }, + { + "epoch": 0.03108, + "grad_norm": 0.3556637596181332, + "learning_rate": 0.003, + "loss": 4.1522, + "step": 3108 + }, + { + "epoch": 0.03109, + "grad_norm": 0.3892938905312542, + "learning_rate": 0.003, + "loss": 4.159, + "step": 3109 + }, + { + "epoch": 0.0311, + "grad_norm": 0.4475819635965094, + "learning_rate": 0.003, + "loss": 4.1799, + "step": 3110 + }, + { + "epoch": 0.03111, + "grad_norm": 0.48864194527310445, + "learning_rate": 0.003, + "loss": 4.1578, + "step": 3111 + }, + { + "epoch": 0.03112, + "grad_norm": 0.5382631741272804, + "learning_rate": 0.003, + "loss": 4.1803, + "step": 3112 + }, + { + "epoch": 0.03113, + "grad_norm": 0.5134933039857796, + "learning_rate": 0.003, + "loss": 4.1686, + "step": 3113 + }, + { + "epoch": 0.03114, + "grad_norm": 0.45531644644558, + "learning_rate": 0.003, + "loss": 4.1695, + "step": 3114 + }, + { + "epoch": 0.03115, + "grad_norm": 0.409821580807442, + "learning_rate": 0.003, + "loss": 4.1707, + "step": 3115 + }, + { + "epoch": 0.03116, + "grad_norm": 0.40691251051535354, + "learning_rate": 0.003, + "loss": 4.1549, + "step": 3116 + }, + { + "epoch": 0.03117, + "grad_norm": 0.41357167905284664, + "learning_rate": 0.003, + "loss": 4.1631, + "step": 3117 + }, + { + "epoch": 0.03118, + "grad_norm": 0.4338761124077823, + "learning_rate": 0.003, + "loss": 4.1632, + "step": 3118 + }, + { + "epoch": 0.03119, + "grad_norm": 0.4549147699459294, + "learning_rate": 0.003, + "loss": 4.1792, + "step": 3119 + }, + { + "epoch": 0.0312, + "grad_norm": 0.5032285577701964, + "learning_rate": 0.003, + "loss": 4.1839, + "step": 3120 + }, + { + "epoch": 0.03121, + "grad_norm": 0.5982541074347542, + "learning_rate": 0.003, + "loss": 4.1639, + "step": 3121 + }, + { + "epoch": 0.03122, + "grad_norm": 0.7093524896825744, + "learning_rate": 0.003, + "loss": 4.1884, + "step": 3122 + }, + { + "epoch": 0.03123, + "grad_norm": 0.6935478647730015, + "learning_rate": 0.003, + "loss": 4.1891, + "step": 3123 + }, + { + "epoch": 0.03124, + "grad_norm": 0.6414330115408258, + "learning_rate": 0.003, + "loss": 4.1632, + "step": 3124 + }, + { + "epoch": 0.03125, + "grad_norm": 0.6053563143256356, + "learning_rate": 0.003, + "loss": 4.1675, + "step": 3125 + }, + { + "epoch": 0.03126, + "grad_norm": 0.5853887861604886, + "learning_rate": 0.003, + "loss": 4.1684, + "step": 3126 + }, + { + "epoch": 0.03127, + "grad_norm": 0.6449491393449573, + "learning_rate": 0.003, + "loss": 4.1802, + "step": 3127 + }, + { + "epoch": 0.03128, + "grad_norm": 0.6502029339707546, + "learning_rate": 0.003, + "loss": 4.1706, + "step": 3128 + }, + { + "epoch": 0.03129, + "grad_norm": 0.7355040262252988, + "learning_rate": 0.003, + "loss": 4.1916, + "step": 3129 + }, + { + "epoch": 0.0313, + "grad_norm": 0.7004678988862265, + "learning_rate": 0.003, + "loss": 4.1982, + "step": 3130 + }, + { + "epoch": 0.03131, + "grad_norm": 0.545773725439425, + "learning_rate": 0.003, + "loss": 4.1723, + "step": 3131 + }, + { + "epoch": 0.03132, + "grad_norm": 0.575890880552423, + "learning_rate": 0.003, + "loss": 4.1945, + "step": 3132 + }, + { + "epoch": 0.03133, + "grad_norm": 0.6235572169219774, + "learning_rate": 0.003, + "loss": 4.1876, + "step": 3133 + }, + { + "epoch": 0.03134, + "grad_norm": 0.6099749117272442, + "learning_rate": 0.003, + "loss": 4.1881, + "step": 3134 + }, + { + "epoch": 0.03135, + "grad_norm": 0.6941243018172416, + "learning_rate": 0.003, + "loss": 4.1573, + "step": 3135 + }, + { + "epoch": 0.03136, + "grad_norm": 0.70121888828651, + "learning_rate": 0.003, + "loss": 4.1772, + "step": 3136 + }, + { + "epoch": 0.03137, + "grad_norm": 0.7085003488749844, + "learning_rate": 0.003, + "loss": 4.1775, + "step": 3137 + }, + { + "epoch": 0.03138, + "grad_norm": 0.6535643891764703, + "learning_rate": 0.003, + "loss": 4.1786, + "step": 3138 + }, + { + "epoch": 0.03139, + "grad_norm": 0.6308297801361256, + "learning_rate": 0.003, + "loss": 4.1826, + "step": 3139 + }, + { + "epoch": 0.0314, + "grad_norm": 0.5309446473531373, + "learning_rate": 0.003, + "loss": 4.1768, + "step": 3140 + }, + { + "epoch": 0.03141, + "grad_norm": 0.5693804160477011, + "learning_rate": 0.003, + "loss": 4.2097, + "step": 3141 + }, + { + "epoch": 0.03142, + "grad_norm": 0.6211997698889902, + "learning_rate": 0.003, + "loss": 4.1899, + "step": 3142 + }, + { + "epoch": 0.03143, + "grad_norm": 0.7073668422617321, + "learning_rate": 0.003, + "loss": 4.1805, + "step": 3143 + }, + { + "epoch": 0.03144, + "grad_norm": 0.7851704146814469, + "learning_rate": 0.003, + "loss": 4.1599, + "step": 3144 + }, + { + "epoch": 0.03145, + "grad_norm": 0.7954348048069579, + "learning_rate": 0.003, + "loss": 4.1863, + "step": 3145 + }, + { + "epoch": 0.03146, + "grad_norm": 0.7034536386154087, + "learning_rate": 0.003, + "loss": 4.1929, + "step": 3146 + }, + { + "epoch": 0.03147, + "grad_norm": 0.6509274147125733, + "learning_rate": 0.003, + "loss": 4.1902, + "step": 3147 + }, + { + "epoch": 0.03148, + "grad_norm": 0.5546841496464855, + "learning_rate": 0.003, + "loss": 4.1877, + "step": 3148 + }, + { + "epoch": 0.03149, + "grad_norm": 0.46984021974514056, + "learning_rate": 0.003, + "loss": 4.1921, + "step": 3149 + }, + { + "epoch": 0.0315, + "grad_norm": 0.47872296043934637, + "learning_rate": 0.003, + "loss": 4.1773, + "step": 3150 + }, + { + "epoch": 0.03151, + "grad_norm": 0.4383095460015223, + "learning_rate": 0.003, + "loss": 4.1832, + "step": 3151 + }, + { + "epoch": 0.03152, + "grad_norm": 0.5147623540970385, + "learning_rate": 0.003, + "loss": 4.1863, + "step": 3152 + }, + { + "epoch": 0.03153, + "grad_norm": 0.5162583950949946, + "learning_rate": 0.003, + "loss": 4.1959, + "step": 3153 + }, + { + "epoch": 0.03154, + "grad_norm": 0.4461000862295419, + "learning_rate": 0.003, + "loss": 4.1831, + "step": 3154 + }, + { + "epoch": 0.03155, + "grad_norm": 0.4554669455184566, + "learning_rate": 0.003, + "loss": 4.1734, + "step": 3155 + }, + { + "epoch": 0.03156, + "grad_norm": 0.5334766080110984, + "learning_rate": 0.003, + "loss": 4.1606, + "step": 3156 + }, + { + "epoch": 0.03157, + "grad_norm": 0.5732423454584378, + "learning_rate": 0.003, + "loss": 4.187, + "step": 3157 + }, + { + "epoch": 0.03158, + "grad_norm": 0.4974411380590613, + "learning_rate": 0.003, + "loss": 4.1698, + "step": 3158 + }, + { + "epoch": 0.03159, + "grad_norm": 0.44203642501633, + "learning_rate": 0.003, + "loss": 4.1664, + "step": 3159 + }, + { + "epoch": 0.0316, + "grad_norm": 0.523140366094698, + "learning_rate": 0.003, + "loss": 4.1866, + "step": 3160 + }, + { + "epoch": 0.03161, + "grad_norm": 0.5162200569611098, + "learning_rate": 0.003, + "loss": 4.1933, + "step": 3161 + }, + { + "epoch": 0.03162, + "grad_norm": 0.5139469343607818, + "learning_rate": 0.003, + "loss": 4.197, + "step": 3162 + }, + { + "epoch": 0.03163, + "grad_norm": 0.4948430436476383, + "learning_rate": 0.003, + "loss": 4.1778, + "step": 3163 + }, + { + "epoch": 0.03164, + "grad_norm": 0.5669314442812998, + "learning_rate": 0.003, + "loss": 4.1763, + "step": 3164 + }, + { + "epoch": 0.03165, + "grad_norm": 0.6170132015572393, + "learning_rate": 0.003, + "loss": 4.151, + "step": 3165 + }, + { + "epoch": 0.03166, + "grad_norm": 0.7007311191133752, + "learning_rate": 0.003, + "loss": 4.1717, + "step": 3166 + }, + { + "epoch": 0.03167, + "grad_norm": 0.8595364667795417, + "learning_rate": 0.003, + "loss": 4.173, + "step": 3167 + }, + { + "epoch": 0.03168, + "grad_norm": 0.7655669847205984, + "learning_rate": 0.003, + "loss": 4.1955, + "step": 3168 + }, + { + "epoch": 0.03169, + "grad_norm": 0.6713498892330952, + "learning_rate": 0.003, + "loss": 4.2051, + "step": 3169 + }, + { + "epoch": 0.0317, + "grad_norm": 0.677153969130682, + "learning_rate": 0.003, + "loss": 4.1601, + "step": 3170 + }, + { + "epoch": 0.03171, + "grad_norm": 0.7390285282020239, + "learning_rate": 0.003, + "loss": 4.2052, + "step": 3171 + }, + { + "epoch": 0.03172, + "grad_norm": 0.7569313720684826, + "learning_rate": 0.003, + "loss": 4.1948, + "step": 3172 + }, + { + "epoch": 0.03173, + "grad_norm": 0.6724394145714953, + "learning_rate": 0.003, + "loss": 4.1656, + "step": 3173 + }, + { + "epoch": 0.03174, + "grad_norm": 0.6566446253256072, + "learning_rate": 0.003, + "loss": 4.1768, + "step": 3174 + }, + { + "epoch": 0.03175, + "grad_norm": 0.5308391109404651, + "learning_rate": 0.003, + "loss": 4.2005, + "step": 3175 + }, + { + "epoch": 0.03176, + "grad_norm": 0.522270282662199, + "learning_rate": 0.003, + "loss": 4.1629, + "step": 3176 + }, + { + "epoch": 0.03177, + "grad_norm": 0.5118875159250269, + "learning_rate": 0.003, + "loss": 4.1695, + "step": 3177 + }, + { + "epoch": 0.03178, + "grad_norm": 0.5975750285946342, + "learning_rate": 0.003, + "loss": 4.2169, + "step": 3178 + }, + { + "epoch": 0.03179, + "grad_norm": 0.6582127079564257, + "learning_rate": 0.003, + "loss": 4.1817, + "step": 3179 + }, + { + "epoch": 0.0318, + "grad_norm": 0.6638267847544939, + "learning_rate": 0.003, + "loss": 4.1847, + "step": 3180 + }, + { + "epoch": 0.03181, + "grad_norm": 0.633651026875217, + "learning_rate": 0.003, + "loss": 4.1946, + "step": 3181 + }, + { + "epoch": 0.03182, + "grad_norm": 0.5922002752582246, + "learning_rate": 0.003, + "loss": 4.1909, + "step": 3182 + }, + { + "epoch": 0.03183, + "grad_norm": 0.6660662970407278, + "learning_rate": 0.003, + "loss": 4.196, + "step": 3183 + }, + { + "epoch": 0.03184, + "grad_norm": 0.6465621990428836, + "learning_rate": 0.003, + "loss": 4.1683, + "step": 3184 + }, + { + "epoch": 0.03185, + "grad_norm": 0.6190012561941471, + "learning_rate": 0.003, + "loss": 4.1874, + "step": 3185 + }, + { + "epoch": 0.03186, + "grad_norm": 0.5745629525057602, + "learning_rate": 0.003, + "loss": 4.1843, + "step": 3186 + }, + { + "epoch": 0.03187, + "grad_norm": 0.596914271669763, + "learning_rate": 0.003, + "loss": 4.1467, + "step": 3187 + }, + { + "epoch": 0.03188, + "grad_norm": 0.5835579641541558, + "learning_rate": 0.003, + "loss": 4.1825, + "step": 3188 + }, + { + "epoch": 0.03189, + "grad_norm": 0.576531148114492, + "learning_rate": 0.003, + "loss": 4.1702, + "step": 3189 + }, + { + "epoch": 0.0319, + "grad_norm": 0.46857212343029936, + "learning_rate": 0.003, + "loss": 4.1619, + "step": 3190 + }, + { + "epoch": 0.03191, + "grad_norm": 0.44669804116834894, + "learning_rate": 0.003, + "loss": 4.1916, + "step": 3191 + }, + { + "epoch": 0.03192, + "grad_norm": 0.5116025817651049, + "learning_rate": 0.003, + "loss": 4.167, + "step": 3192 + }, + { + "epoch": 0.03193, + "grad_norm": 0.5017323740559085, + "learning_rate": 0.003, + "loss": 4.1666, + "step": 3193 + }, + { + "epoch": 0.03194, + "grad_norm": 0.5506388551758917, + "learning_rate": 0.003, + "loss": 4.1961, + "step": 3194 + }, + { + "epoch": 0.03195, + "grad_norm": 0.6905257500061265, + "learning_rate": 0.003, + "loss": 4.1672, + "step": 3195 + }, + { + "epoch": 0.03196, + "grad_norm": 0.6814664188108994, + "learning_rate": 0.003, + "loss": 4.18, + "step": 3196 + }, + { + "epoch": 0.03197, + "grad_norm": 0.6734616316609701, + "learning_rate": 0.003, + "loss": 4.15, + "step": 3197 + }, + { + "epoch": 0.03198, + "grad_norm": 0.5599435801670639, + "learning_rate": 0.003, + "loss": 4.1795, + "step": 3198 + }, + { + "epoch": 0.03199, + "grad_norm": 0.4771733892435305, + "learning_rate": 0.003, + "loss": 4.15, + "step": 3199 + }, + { + "epoch": 0.032, + "grad_norm": 0.4956367507190788, + "learning_rate": 0.003, + "loss": 4.172, + "step": 3200 + }, + { + "epoch": 0.03201, + "grad_norm": 0.40451540956897725, + "learning_rate": 0.003, + "loss": 4.1302, + "step": 3201 + }, + { + "epoch": 0.03202, + "grad_norm": 0.42753873923321506, + "learning_rate": 0.003, + "loss": 4.1703, + "step": 3202 + }, + { + "epoch": 0.03203, + "grad_norm": 0.45457758457355213, + "learning_rate": 0.003, + "loss": 4.1545, + "step": 3203 + }, + { + "epoch": 0.03204, + "grad_norm": 0.49270456285505987, + "learning_rate": 0.003, + "loss": 4.1737, + "step": 3204 + }, + { + "epoch": 0.03205, + "grad_norm": 0.551907056420435, + "learning_rate": 0.003, + "loss": 4.1698, + "step": 3205 + }, + { + "epoch": 0.03206, + "grad_norm": 0.5848754633539133, + "learning_rate": 0.003, + "loss": 4.1654, + "step": 3206 + }, + { + "epoch": 0.03207, + "grad_norm": 0.5646598219152835, + "learning_rate": 0.003, + "loss": 4.1449, + "step": 3207 + }, + { + "epoch": 0.03208, + "grad_norm": 0.5057453481084084, + "learning_rate": 0.003, + "loss": 4.1662, + "step": 3208 + }, + { + "epoch": 0.03209, + "grad_norm": 0.4868038489807391, + "learning_rate": 0.003, + "loss": 4.2026, + "step": 3209 + }, + { + "epoch": 0.0321, + "grad_norm": 0.5084192511453576, + "learning_rate": 0.003, + "loss": 4.1839, + "step": 3210 + }, + { + "epoch": 0.03211, + "grad_norm": 0.536979773481941, + "learning_rate": 0.003, + "loss": 4.1566, + "step": 3211 + }, + { + "epoch": 0.03212, + "grad_norm": 0.6453864413806403, + "learning_rate": 0.003, + "loss": 4.171, + "step": 3212 + }, + { + "epoch": 0.03213, + "grad_norm": 0.6979695060374274, + "learning_rate": 0.003, + "loss": 4.1767, + "step": 3213 + }, + { + "epoch": 0.03214, + "grad_norm": 0.8362949946620015, + "learning_rate": 0.003, + "loss": 4.1512, + "step": 3214 + }, + { + "epoch": 0.03215, + "grad_norm": 0.9000343317810777, + "learning_rate": 0.003, + "loss": 4.1727, + "step": 3215 + }, + { + "epoch": 0.03216, + "grad_norm": 0.8252513255224121, + "learning_rate": 0.003, + "loss": 4.1822, + "step": 3216 + }, + { + "epoch": 0.03217, + "grad_norm": 0.7553474851929083, + "learning_rate": 0.003, + "loss": 4.18, + "step": 3217 + }, + { + "epoch": 0.03218, + "grad_norm": 0.7184034075019531, + "learning_rate": 0.003, + "loss": 4.1747, + "step": 3218 + }, + { + "epoch": 0.03219, + "grad_norm": 0.7559505824598467, + "learning_rate": 0.003, + "loss": 4.1839, + "step": 3219 + }, + { + "epoch": 0.0322, + "grad_norm": 0.7462679166089925, + "learning_rate": 0.003, + "loss": 4.1959, + "step": 3220 + }, + { + "epoch": 0.03221, + "grad_norm": 0.8114417269245792, + "learning_rate": 0.003, + "loss": 4.2066, + "step": 3221 + }, + { + "epoch": 0.03222, + "grad_norm": 0.7398684360618538, + "learning_rate": 0.003, + "loss": 4.1975, + "step": 3222 + }, + { + "epoch": 0.03223, + "grad_norm": 0.6559376119246889, + "learning_rate": 0.003, + "loss": 4.1971, + "step": 3223 + }, + { + "epoch": 0.03224, + "grad_norm": 0.6017029656710154, + "learning_rate": 0.003, + "loss": 4.1559, + "step": 3224 + }, + { + "epoch": 0.03225, + "grad_norm": 0.5653674381326333, + "learning_rate": 0.003, + "loss": 4.1866, + "step": 3225 + }, + { + "epoch": 0.03226, + "grad_norm": 0.6502250085881689, + "learning_rate": 0.003, + "loss": 4.1724, + "step": 3226 + }, + { + "epoch": 0.03227, + "grad_norm": 0.7143712957982395, + "learning_rate": 0.003, + "loss": 4.1994, + "step": 3227 + }, + { + "epoch": 0.03228, + "grad_norm": 0.6315781743986074, + "learning_rate": 0.003, + "loss": 4.1692, + "step": 3228 + }, + { + "epoch": 0.03229, + "grad_norm": 0.5567058811508958, + "learning_rate": 0.003, + "loss": 4.1858, + "step": 3229 + }, + { + "epoch": 0.0323, + "grad_norm": 0.5544929832938678, + "learning_rate": 0.003, + "loss": 4.1766, + "step": 3230 + }, + { + "epoch": 0.03231, + "grad_norm": 0.470332875801311, + "learning_rate": 0.003, + "loss": 4.1883, + "step": 3231 + }, + { + "epoch": 0.03232, + "grad_norm": 0.4362924227260804, + "learning_rate": 0.003, + "loss": 4.1748, + "step": 3232 + }, + { + "epoch": 0.03233, + "grad_norm": 0.4029479081021775, + "learning_rate": 0.003, + "loss": 4.1893, + "step": 3233 + }, + { + "epoch": 0.03234, + "grad_norm": 0.48479141962478917, + "learning_rate": 0.003, + "loss": 4.1724, + "step": 3234 + }, + { + "epoch": 0.03235, + "grad_norm": 0.5441326810507732, + "learning_rate": 0.003, + "loss": 4.1834, + "step": 3235 + }, + { + "epoch": 0.03236, + "grad_norm": 0.5291856539541236, + "learning_rate": 0.003, + "loss": 4.1742, + "step": 3236 + }, + { + "epoch": 0.03237, + "grad_norm": 0.4961166691014795, + "learning_rate": 0.003, + "loss": 4.1904, + "step": 3237 + }, + { + "epoch": 0.03238, + "grad_norm": 0.4503193502587542, + "learning_rate": 0.003, + "loss": 4.1937, + "step": 3238 + }, + { + "epoch": 0.03239, + "grad_norm": 0.49034979349671226, + "learning_rate": 0.003, + "loss": 4.176, + "step": 3239 + }, + { + "epoch": 0.0324, + "grad_norm": 0.5016168112544078, + "learning_rate": 0.003, + "loss": 4.1905, + "step": 3240 + }, + { + "epoch": 0.03241, + "grad_norm": 0.576118532422288, + "learning_rate": 0.003, + "loss": 4.1878, + "step": 3241 + }, + { + "epoch": 0.03242, + "grad_norm": 0.6819472138134928, + "learning_rate": 0.003, + "loss": 4.165, + "step": 3242 + }, + { + "epoch": 0.03243, + "grad_norm": 0.7729342734665511, + "learning_rate": 0.003, + "loss": 4.1588, + "step": 3243 + }, + { + "epoch": 0.03244, + "grad_norm": 0.8596141569373975, + "learning_rate": 0.003, + "loss": 4.1873, + "step": 3244 + }, + { + "epoch": 0.03245, + "grad_norm": 0.8099112239109375, + "learning_rate": 0.003, + "loss": 4.2019, + "step": 3245 + }, + { + "epoch": 0.03246, + "grad_norm": 0.6033775070976766, + "learning_rate": 0.003, + "loss": 4.1827, + "step": 3246 + }, + { + "epoch": 0.03247, + "grad_norm": 0.7122383502056996, + "learning_rate": 0.003, + "loss": 4.1951, + "step": 3247 + }, + { + "epoch": 0.03248, + "grad_norm": 0.6134714767115921, + "learning_rate": 0.003, + "loss": 4.2117, + "step": 3248 + }, + { + "epoch": 0.03249, + "grad_norm": 0.551686621394429, + "learning_rate": 0.003, + "loss": 4.172, + "step": 3249 + }, + { + "epoch": 0.0325, + "grad_norm": 0.4692280069519903, + "learning_rate": 0.003, + "loss": 4.2016, + "step": 3250 + }, + { + "epoch": 0.03251, + "grad_norm": 0.4762141737958913, + "learning_rate": 0.003, + "loss": 4.1968, + "step": 3251 + }, + { + "epoch": 0.03252, + "grad_norm": 0.451291493723249, + "learning_rate": 0.003, + "loss": 4.1609, + "step": 3252 + }, + { + "epoch": 0.03253, + "grad_norm": 0.5006035466184977, + "learning_rate": 0.003, + "loss": 4.1658, + "step": 3253 + }, + { + "epoch": 0.03254, + "grad_norm": 0.49103882046130487, + "learning_rate": 0.003, + "loss": 4.1589, + "step": 3254 + }, + { + "epoch": 0.03255, + "grad_norm": 0.528505966654426, + "learning_rate": 0.003, + "loss": 4.1726, + "step": 3255 + }, + { + "epoch": 0.03256, + "grad_norm": 0.5411019927206225, + "learning_rate": 0.003, + "loss": 4.1715, + "step": 3256 + }, + { + "epoch": 0.03257, + "grad_norm": 0.5426466711031169, + "learning_rate": 0.003, + "loss": 4.1571, + "step": 3257 + }, + { + "epoch": 0.03258, + "grad_norm": 0.5458703600164347, + "learning_rate": 0.003, + "loss": 4.1709, + "step": 3258 + }, + { + "epoch": 0.03259, + "grad_norm": 0.5203040924859128, + "learning_rate": 0.003, + "loss": 4.1699, + "step": 3259 + }, + { + "epoch": 0.0326, + "grad_norm": 0.45480155011690926, + "learning_rate": 0.003, + "loss": 4.1662, + "step": 3260 + }, + { + "epoch": 0.03261, + "grad_norm": 0.4731935212642663, + "learning_rate": 0.003, + "loss": 4.1413, + "step": 3261 + }, + { + "epoch": 0.03262, + "grad_norm": 0.47522534323877436, + "learning_rate": 0.003, + "loss": 4.1669, + "step": 3262 + }, + { + "epoch": 0.03263, + "grad_norm": 0.4999694797882516, + "learning_rate": 0.003, + "loss": 4.1772, + "step": 3263 + }, + { + "epoch": 0.03264, + "grad_norm": 0.5217567082705248, + "learning_rate": 0.003, + "loss": 4.189, + "step": 3264 + }, + { + "epoch": 0.03265, + "grad_norm": 0.6014802197349467, + "learning_rate": 0.003, + "loss": 4.187, + "step": 3265 + }, + { + "epoch": 0.03266, + "grad_norm": 0.6262382617606659, + "learning_rate": 0.003, + "loss": 4.1419, + "step": 3266 + }, + { + "epoch": 0.03267, + "grad_norm": 0.6366837634406238, + "learning_rate": 0.003, + "loss": 4.1524, + "step": 3267 + }, + { + "epoch": 0.03268, + "grad_norm": 0.724266364773543, + "learning_rate": 0.003, + "loss": 4.1497, + "step": 3268 + }, + { + "epoch": 0.03269, + "grad_norm": 0.7761710629109877, + "learning_rate": 0.003, + "loss": 4.1789, + "step": 3269 + }, + { + "epoch": 0.0327, + "grad_norm": 0.7383390055481723, + "learning_rate": 0.003, + "loss": 4.2091, + "step": 3270 + }, + { + "epoch": 0.03271, + "grad_norm": 0.6168450072471018, + "learning_rate": 0.003, + "loss": 4.1678, + "step": 3271 + }, + { + "epoch": 0.03272, + "grad_norm": 0.5388626714250451, + "learning_rate": 0.003, + "loss": 4.1712, + "step": 3272 + }, + { + "epoch": 0.03273, + "grad_norm": 0.541266940860741, + "learning_rate": 0.003, + "loss": 4.1713, + "step": 3273 + }, + { + "epoch": 0.03274, + "grad_norm": 0.5293709638699655, + "learning_rate": 0.003, + "loss": 4.1897, + "step": 3274 + }, + { + "epoch": 0.03275, + "grad_norm": 0.45689882391191533, + "learning_rate": 0.003, + "loss": 4.1828, + "step": 3275 + }, + { + "epoch": 0.03276, + "grad_norm": 0.341646145545343, + "learning_rate": 0.003, + "loss": 4.1585, + "step": 3276 + }, + { + "epoch": 0.03277, + "grad_norm": 0.36838541029861377, + "learning_rate": 0.003, + "loss": 4.1576, + "step": 3277 + }, + { + "epoch": 0.03278, + "grad_norm": 0.3636023442028596, + "learning_rate": 0.003, + "loss": 4.1669, + "step": 3278 + }, + { + "epoch": 0.03279, + "grad_norm": 0.3985620455372761, + "learning_rate": 0.003, + "loss": 4.1545, + "step": 3279 + }, + { + "epoch": 0.0328, + "grad_norm": 0.4543675760898354, + "learning_rate": 0.003, + "loss": 4.1411, + "step": 3280 + }, + { + "epoch": 0.03281, + "grad_norm": 0.5083262289907728, + "learning_rate": 0.003, + "loss": 4.1897, + "step": 3281 + }, + { + "epoch": 0.03282, + "grad_norm": 0.6315851302649976, + "learning_rate": 0.003, + "loss": 4.1492, + "step": 3282 + }, + { + "epoch": 0.03283, + "grad_norm": 0.7322734724447769, + "learning_rate": 0.003, + "loss": 4.1521, + "step": 3283 + }, + { + "epoch": 0.03284, + "grad_norm": 0.7647225152469821, + "learning_rate": 0.003, + "loss": 4.1977, + "step": 3284 + }, + { + "epoch": 0.03285, + "grad_norm": 0.8483759241793429, + "learning_rate": 0.003, + "loss": 4.1483, + "step": 3285 + }, + { + "epoch": 0.03286, + "grad_norm": 0.7999560566423449, + "learning_rate": 0.003, + "loss": 4.1904, + "step": 3286 + }, + { + "epoch": 0.03287, + "grad_norm": 0.71589535526088, + "learning_rate": 0.003, + "loss": 4.196, + "step": 3287 + }, + { + "epoch": 0.03288, + "grad_norm": 0.6336111011473581, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 3288 + }, + { + "epoch": 0.03289, + "grad_norm": 0.6641631109672658, + "learning_rate": 0.003, + "loss": 4.1775, + "step": 3289 + }, + { + "epoch": 0.0329, + "grad_norm": 0.6647898720593706, + "learning_rate": 0.003, + "loss": 4.2094, + "step": 3290 + }, + { + "epoch": 0.03291, + "grad_norm": 0.5807864922821838, + "learning_rate": 0.003, + "loss": 4.1898, + "step": 3291 + }, + { + "epoch": 0.03292, + "grad_norm": 0.515754721038097, + "learning_rate": 0.003, + "loss": 4.1509, + "step": 3292 + }, + { + "epoch": 0.03293, + "grad_norm": 0.5498053265889223, + "learning_rate": 0.003, + "loss": 4.1693, + "step": 3293 + }, + { + "epoch": 0.03294, + "grad_norm": 0.6841987985815203, + "learning_rate": 0.003, + "loss": 4.1825, + "step": 3294 + }, + { + "epoch": 0.03295, + "grad_norm": 0.6093729695331228, + "learning_rate": 0.003, + "loss": 4.1702, + "step": 3295 + }, + { + "epoch": 0.03296, + "grad_norm": 0.6089661662715524, + "learning_rate": 0.003, + "loss": 4.1627, + "step": 3296 + }, + { + "epoch": 0.03297, + "grad_norm": 0.5614137495318692, + "learning_rate": 0.003, + "loss": 4.1715, + "step": 3297 + }, + { + "epoch": 0.03298, + "grad_norm": 0.5134366817986185, + "learning_rate": 0.003, + "loss": 4.1886, + "step": 3298 + }, + { + "epoch": 0.03299, + "grad_norm": 0.549963907837565, + "learning_rate": 0.003, + "loss": 4.1739, + "step": 3299 + }, + { + "epoch": 0.033, + "grad_norm": 0.6519931831327299, + "learning_rate": 0.003, + "loss": 4.1675, + "step": 3300 + }, + { + "epoch": 0.03301, + "grad_norm": 0.7183972305593801, + "learning_rate": 0.003, + "loss": 4.2021, + "step": 3301 + }, + { + "epoch": 0.03302, + "grad_norm": 0.7018171786354589, + "learning_rate": 0.003, + "loss": 4.1793, + "step": 3302 + }, + { + "epoch": 0.03303, + "grad_norm": 0.5921722024443354, + "learning_rate": 0.003, + "loss": 4.1793, + "step": 3303 + }, + { + "epoch": 0.03304, + "grad_norm": 0.5309946357381335, + "learning_rate": 0.003, + "loss": 4.1726, + "step": 3304 + }, + { + "epoch": 0.03305, + "grad_norm": 0.5654757181942384, + "learning_rate": 0.003, + "loss": 4.1781, + "step": 3305 + }, + { + "epoch": 0.03306, + "grad_norm": 0.5651614262192222, + "learning_rate": 0.003, + "loss": 4.194, + "step": 3306 + }, + { + "epoch": 0.03307, + "grad_norm": 0.6220742834040969, + "learning_rate": 0.003, + "loss": 4.1933, + "step": 3307 + }, + { + "epoch": 0.03308, + "grad_norm": 0.6857411580561741, + "learning_rate": 0.003, + "loss": 4.1774, + "step": 3308 + }, + { + "epoch": 0.03309, + "grad_norm": 0.7647611362606778, + "learning_rate": 0.003, + "loss": 4.168, + "step": 3309 + }, + { + "epoch": 0.0331, + "grad_norm": 0.9027911762824596, + "learning_rate": 0.003, + "loss": 4.1838, + "step": 3310 + }, + { + "epoch": 0.03311, + "grad_norm": 0.9232745325192504, + "learning_rate": 0.003, + "loss": 4.2092, + "step": 3311 + }, + { + "epoch": 0.03312, + "grad_norm": 1.0033276702218215, + "learning_rate": 0.003, + "loss": 4.1863, + "step": 3312 + }, + { + "epoch": 0.03313, + "grad_norm": 0.8616181406278286, + "learning_rate": 0.003, + "loss": 4.217, + "step": 3313 + }, + { + "epoch": 0.03314, + "grad_norm": 0.6538312191871826, + "learning_rate": 0.003, + "loss": 4.1482, + "step": 3314 + }, + { + "epoch": 0.03315, + "grad_norm": 0.6513293592499171, + "learning_rate": 0.003, + "loss": 4.2011, + "step": 3315 + }, + { + "epoch": 0.03316, + "grad_norm": 0.5753636988224785, + "learning_rate": 0.003, + "loss": 4.1986, + "step": 3316 + }, + { + "epoch": 0.03317, + "grad_norm": 0.5660504612340471, + "learning_rate": 0.003, + "loss": 4.1897, + "step": 3317 + }, + { + "epoch": 0.03318, + "grad_norm": 0.5321945122016645, + "learning_rate": 0.003, + "loss": 4.1661, + "step": 3318 + }, + { + "epoch": 0.03319, + "grad_norm": 0.5341973320019645, + "learning_rate": 0.003, + "loss": 4.1598, + "step": 3319 + }, + { + "epoch": 0.0332, + "grad_norm": 0.47286473477321905, + "learning_rate": 0.003, + "loss": 4.1852, + "step": 3320 + }, + { + "epoch": 0.03321, + "grad_norm": 0.5050275992577558, + "learning_rate": 0.003, + "loss": 4.1965, + "step": 3321 + }, + { + "epoch": 0.03322, + "grad_norm": 0.4844023535662593, + "learning_rate": 0.003, + "loss": 4.1482, + "step": 3322 + }, + { + "epoch": 0.03323, + "grad_norm": 0.43309930004470687, + "learning_rate": 0.003, + "loss": 4.1893, + "step": 3323 + }, + { + "epoch": 0.03324, + "grad_norm": 0.44890262885350884, + "learning_rate": 0.003, + "loss": 4.1546, + "step": 3324 + }, + { + "epoch": 0.03325, + "grad_norm": 0.5095521797271855, + "learning_rate": 0.003, + "loss": 4.1917, + "step": 3325 + }, + { + "epoch": 0.03326, + "grad_norm": 0.477471322433437, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 3326 + }, + { + "epoch": 0.03327, + "grad_norm": 0.4315337339422698, + "learning_rate": 0.003, + "loss": 4.1751, + "step": 3327 + }, + { + "epoch": 0.03328, + "grad_norm": 0.4171492646524173, + "learning_rate": 0.003, + "loss": 4.1912, + "step": 3328 + }, + { + "epoch": 0.03329, + "grad_norm": 0.4506599360723401, + "learning_rate": 0.003, + "loss": 4.149, + "step": 3329 + }, + { + "epoch": 0.0333, + "grad_norm": 0.43373534373844225, + "learning_rate": 0.003, + "loss": 4.1533, + "step": 3330 + }, + { + "epoch": 0.03331, + "grad_norm": 0.48072192590135343, + "learning_rate": 0.003, + "loss": 4.1525, + "step": 3331 + }, + { + "epoch": 0.03332, + "grad_norm": 0.5603763883718295, + "learning_rate": 0.003, + "loss": 4.1515, + "step": 3332 + }, + { + "epoch": 0.03333, + "grad_norm": 0.5773402592299904, + "learning_rate": 0.003, + "loss": 4.1398, + "step": 3333 + }, + { + "epoch": 0.03334, + "grad_norm": 0.5925239668699488, + "learning_rate": 0.003, + "loss": 4.1717, + "step": 3334 + }, + { + "epoch": 0.03335, + "grad_norm": 0.5265113459144353, + "learning_rate": 0.003, + "loss": 4.1765, + "step": 3335 + }, + { + "epoch": 0.03336, + "grad_norm": 0.4418521183409816, + "learning_rate": 0.003, + "loss": 4.1732, + "step": 3336 + }, + { + "epoch": 0.03337, + "grad_norm": 0.42952166536927455, + "learning_rate": 0.003, + "loss": 4.1516, + "step": 3337 + }, + { + "epoch": 0.03338, + "grad_norm": 0.43519066754716446, + "learning_rate": 0.003, + "loss": 4.1603, + "step": 3338 + }, + { + "epoch": 0.03339, + "grad_norm": 0.4658156069803623, + "learning_rate": 0.003, + "loss": 4.1638, + "step": 3339 + }, + { + "epoch": 0.0334, + "grad_norm": 0.5171946739861909, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 3340 + }, + { + "epoch": 0.03341, + "grad_norm": 0.6356266827382321, + "learning_rate": 0.003, + "loss": 4.1677, + "step": 3341 + }, + { + "epoch": 0.03342, + "grad_norm": 0.6867092162549009, + "learning_rate": 0.003, + "loss": 4.1583, + "step": 3342 + }, + { + "epoch": 0.03343, + "grad_norm": 0.704031363814926, + "learning_rate": 0.003, + "loss": 4.1361, + "step": 3343 + }, + { + "epoch": 0.03344, + "grad_norm": 0.7513440673993107, + "learning_rate": 0.003, + "loss": 4.1949, + "step": 3344 + }, + { + "epoch": 0.03345, + "grad_norm": 0.7519677795539157, + "learning_rate": 0.003, + "loss": 4.1731, + "step": 3345 + }, + { + "epoch": 0.03346, + "grad_norm": 0.7713219880690217, + "learning_rate": 0.003, + "loss": 4.19, + "step": 3346 + }, + { + "epoch": 0.03347, + "grad_norm": 0.8480035915924508, + "learning_rate": 0.003, + "loss": 4.1797, + "step": 3347 + }, + { + "epoch": 0.03348, + "grad_norm": 0.8325415221214432, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 3348 + }, + { + "epoch": 0.03349, + "grad_norm": 0.7621143367252534, + "learning_rate": 0.003, + "loss": 4.2199, + "step": 3349 + }, + { + "epoch": 0.0335, + "grad_norm": 0.7234514614978355, + "learning_rate": 0.003, + "loss": 4.1749, + "step": 3350 + }, + { + "epoch": 0.03351, + "grad_norm": 0.6715941237384234, + "learning_rate": 0.003, + "loss": 4.1825, + "step": 3351 + }, + { + "epoch": 0.03352, + "grad_norm": 0.6771948560655807, + "learning_rate": 0.003, + "loss": 4.1972, + "step": 3352 + }, + { + "epoch": 0.03353, + "grad_norm": 0.7374767561640508, + "learning_rate": 0.003, + "loss": 4.1997, + "step": 3353 + }, + { + "epoch": 0.03354, + "grad_norm": 0.7924221564216216, + "learning_rate": 0.003, + "loss": 4.1685, + "step": 3354 + }, + { + "epoch": 0.03355, + "grad_norm": 0.752343695322846, + "learning_rate": 0.003, + "loss": 4.1919, + "step": 3355 + }, + { + "epoch": 0.03356, + "grad_norm": 0.6955515760269133, + "learning_rate": 0.003, + "loss": 4.2144, + "step": 3356 + }, + { + "epoch": 0.03357, + "grad_norm": 0.6439226598770458, + "learning_rate": 0.003, + "loss": 4.1843, + "step": 3357 + }, + { + "epoch": 0.03358, + "grad_norm": 0.6072963552767238, + "learning_rate": 0.003, + "loss": 4.2011, + "step": 3358 + }, + { + "epoch": 0.03359, + "grad_norm": 0.5544317009226006, + "learning_rate": 0.003, + "loss": 4.1868, + "step": 3359 + }, + { + "epoch": 0.0336, + "grad_norm": 0.7584252977946636, + "learning_rate": 0.003, + "loss": 4.2055, + "step": 3360 + }, + { + "epoch": 0.03361, + "grad_norm": 0.8675540717955526, + "learning_rate": 0.003, + "loss": 4.2091, + "step": 3361 + }, + { + "epoch": 0.03362, + "grad_norm": 0.8109217912617241, + "learning_rate": 0.003, + "loss": 4.2092, + "step": 3362 + }, + { + "epoch": 0.03363, + "grad_norm": 0.7666749182841046, + "learning_rate": 0.003, + "loss": 4.1611, + "step": 3363 + }, + { + "epoch": 0.03364, + "grad_norm": 0.7823149796193858, + "learning_rate": 0.003, + "loss": 4.1828, + "step": 3364 + }, + { + "epoch": 0.03365, + "grad_norm": 0.6387430725521948, + "learning_rate": 0.003, + "loss": 4.1715, + "step": 3365 + }, + { + "epoch": 0.03366, + "grad_norm": 0.5594833522786266, + "learning_rate": 0.003, + "loss": 4.1436, + "step": 3366 + }, + { + "epoch": 0.03367, + "grad_norm": 0.5181383097238287, + "learning_rate": 0.003, + "loss": 4.1575, + "step": 3367 + }, + { + "epoch": 0.03368, + "grad_norm": 0.5760572623008274, + "learning_rate": 0.003, + "loss": 4.207, + "step": 3368 + }, + { + "epoch": 0.03369, + "grad_norm": 0.550972005009095, + "learning_rate": 0.003, + "loss": 4.165, + "step": 3369 + }, + { + "epoch": 0.0337, + "grad_norm": 0.5261333264386098, + "learning_rate": 0.003, + "loss": 4.1995, + "step": 3370 + }, + { + "epoch": 0.03371, + "grad_norm": 0.4162241207425965, + "learning_rate": 0.003, + "loss": 4.1281, + "step": 3371 + }, + { + "epoch": 0.03372, + "grad_norm": 0.3964364873259744, + "learning_rate": 0.003, + "loss": 4.1565, + "step": 3372 + }, + { + "epoch": 0.03373, + "grad_norm": 0.39921807899478956, + "learning_rate": 0.003, + "loss": 4.1886, + "step": 3373 + }, + { + "epoch": 0.03374, + "grad_norm": 0.40722470978586633, + "learning_rate": 0.003, + "loss": 4.1693, + "step": 3374 + }, + { + "epoch": 0.03375, + "grad_norm": 0.4174362735293826, + "learning_rate": 0.003, + "loss": 4.1541, + "step": 3375 + }, + { + "epoch": 0.03376, + "grad_norm": 0.4670589348527475, + "learning_rate": 0.003, + "loss": 4.133, + "step": 3376 + }, + { + "epoch": 0.03377, + "grad_norm": 0.5539342573100582, + "learning_rate": 0.003, + "loss": 4.1786, + "step": 3377 + }, + { + "epoch": 0.03378, + "grad_norm": 0.6187862572290908, + "learning_rate": 0.003, + "loss": 4.1833, + "step": 3378 + }, + { + "epoch": 0.03379, + "grad_norm": 0.6015622211172064, + "learning_rate": 0.003, + "loss": 4.1567, + "step": 3379 + }, + { + "epoch": 0.0338, + "grad_norm": 0.4615401428395281, + "learning_rate": 0.003, + "loss": 4.1799, + "step": 3380 + }, + { + "epoch": 0.03381, + "grad_norm": 0.32933941833228836, + "learning_rate": 0.003, + "loss": 4.1786, + "step": 3381 + }, + { + "epoch": 0.03382, + "grad_norm": 0.3788346338425552, + "learning_rate": 0.003, + "loss": 4.1673, + "step": 3382 + }, + { + "epoch": 0.03383, + "grad_norm": 0.376989210662061, + "learning_rate": 0.003, + "loss": 4.13, + "step": 3383 + }, + { + "epoch": 0.03384, + "grad_norm": 0.34387151748219785, + "learning_rate": 0.003, + "loss": 4.1531, + "step": 3384 + }, + { + "epoch": 0.03385, + "grad_norm": 0.3352611282783692, + "learning_rate": 0.003, + "loss": 4.1564, + "step": 3385 + }, + { + "epoch": 0.03386, + "grad_norm": 0.33892990368837295, + "learning_rate": 0.003, + "loss": 4.1511, + "step": 3386 + }, + { + "epoch": 0.03387, + "grad_norm": 0.37206580118259613, + "learning_rate": 0.003, + "loss": 4.1538, + "step": 3387 + }, + { + "epoch": 0.03388, + "grad_norm": 0.41506855452840574, + "learning_rate": 0.003, + "loss": 4.1663, + "step": 3388 + }, + { + "epoch": 0.03389, + "grad_norm": 0.42405220029053703, + "learning_rate": 0.003, + "loss": 4.1835, + "step": 3389 + }, + { + "epoch": 0.0339, + "grad_norm": 0.4090594827115757, + "learning_rate": 0.003, + "loss": 4.1696, + "step": 3390 + }, + { + "epoch": 0.03391, + "grad_norm": 0.4342402082004512, + "learning_rate": 0.003, + "loss": 4.148, + "step": 3391 + }, + { + "epoch": 0.03392, + "grad_norm": 0.49470807894733526, + "learning_rate": 0.003, + "loss": 4.1568, + "step": 3392 + }, + { + "epoch": 0.03393, + "grad_norm": 0.7113520954651117, + "learning_rate": 0.003, + "loss": 4.1741, + "step": 3393 + }, + { + "epoch": 0.03394, + "grad_norm": 0.8695013133927905, + "learning_rate": 0.003, + "loss": 4.1852, + "step": 3394 + }, + { + "epoch": 0.03395, + "grad_norm": 0.937839460650371, + "learning_rate": 0.003, + "loss": 4.1994, + "step": 3395 + }, + { + "epoch": 0.03396, + "grad_norm": 0.7904694460773314, + "learning_rate": 0.003, + "loss": 4.1798, + "step": 3396 + }, + { + "epoch": 0.03397, + "grad_norm": 0.7235872457578897, + "learning_rate": 0.003, + "loss": 4.1456, + "step": 3397 + }, + { + "epoch": 0.03398, + "grad_norm": 0.8062806717697898, + "learning_rate": 0.003, + "loss": 4.2062, + "step": 3398 + }, + { + "epoch": 0.03399, + "grad_norm": 0.7158225027891656, + "learning_rate": 0.003, + "loss": 4.2222, + "step": 3399 + }, + { + "epoch": 0.034, + "grad_norm": 0.6123275074220181, + "learning_rate": 0.003, + "loss": 4.1995, + "step": 3400 + }, + { + "epoch": 0.03401, + "grad_norm": 0.6599152366758141, + "learning_rate": 0.003, + "loss": 4.1767, + "step": 3401 + }, + { + "epoch": 0.03402, + "grad_norm": 0.6314070611427443, + "learning_rate": 0.003, + "loss": 4.1971, + "step": 3402 + }, + { + "epoch": 0.03403, + "grad_norm": 0.6571294664516991, + "learning_rate": 0.003, + "loss": 4.1648, + "step": 3403 + }, + { + "epoch": 0.03404, + "grad_norm": 0.6404238631170781, + "learning_rate": 0.003, + "loss": 4.2011, + "step": 3404 + }, + { + "epoch": 0.03405, + "grad_norm": 0.6321854819314461, + "learning_rate": 0.003, + "loss": 4.1828, + "step": 3405 + }, + { + "epoch": 0.03406, + "grad_norm": 0.6941079886283862, + "learning_rate": 0.003, + "loss": 4.1694, + "step": 3406 + }, + { + "epoch": 0.03407, + "grad_norm": 0.8136592990040448, + "learning_rate": 0.003, + "loss": 4.1931, + "step": 3407 + }, + { + "epoch": 0.03408, + "grad_norm": 0.8054752836427478, + "learning_rate": 0.003, + "loss": 4.1777, + "step": 3408 + }, + { + "epoch": 0.03409, + "grad_norm": 0.7438300271168681, + "learning_rate": 0.003, + "loss": 4.1836, + "step": 3409 + }, + { + "epoch": 0.0341, + "grad_norm": 0.7769660820773376, + "learning_rate": 0.003, + "loss": 4.1811, + "step": 3410 + }, + { + "epoch": 0.03411, + "grad_norm": 0.7387445989360225, + "learning_rate": 0.003, + "loss": 4.1964, + "step": 3411 + }, + { + "epoch": 0.03412, + "grad_norm": 0.7543102885862639, + "learning_rate": 0.003, + "loss": 4.2165, + "step": 3412 + }, + { + "epoch": 0.03413, + "grad_norm": 0.7858533091946368, + "learning_rate": 0.003, + "loss": 4.2279, + "step": 3413 + }, + { + "epoch": 0.03414, + "grad_norm": 0.8085223407654301, + "learning_rate": 0.003, + "loss": 4.2197, + "step": 3414 + }, + { + "epoch": 0.03415, + "grad_norm": 0.7204513262789403, + "learning_rate": 0.003, + "loss": 4.1982, + "step": 3415 + }, + { + "epoch": 0.03416, + "grad_norm": 0.669578263783182, + "learning_rate": 0.003, + "loss": 4.2009, + "step": 3416 + }, + { + "epoch": 0.03417, + "grad_norm": 0.595729901049528, + "learning_rate": 0.003, + "loss": 4.1874, + "step": 3417 + }, + { + "epoch": 0.03418, + "grad_norm": 0.6302485601669457, + "learning_rate": 0.003, + "loss": 4.1603, + "step": 3418 + }, + { + "epoch": 0.03419, + "grad_norm": 0.48681001127779644, + "learning_rate": 0.003, + "loss": 4.1774, + "step": 3419 + }, + { + "epoch": 0.0342, + "grad_norm": 0.4381251825205999, + "learning_rate": 0.003, + "loss": 4.1848, + "step": 3420 + }, + { + "epoch": 0.03421, + "grad_norm": 0.40842368527410894, + "learning_rate": 0.003, + "loss": 4.1733, + "step": 3421 + }, + { + "epoch": 0.03422, + "grad_norm": 0.4158377016736793, + "learning_rate": 0.003, + "loss": 4.1888, + "step": 3422 + }, + { + "epoch": 0.03423, + "grad_norm": 0.38251097705951487, + "learning_rate": 0.003, + "loss": 4.1647, + "step": 3423 + }, + { + "epoch": 0.03424, + "grad_norm": 0.4199738680628049, + "learning_rate": 0.003, + "loss": 4.1889, + "step": 3424 + }, + { + "epoch": 0.03425, + "grad_norm": 0.5152529015696985, + "learning_rate": 0.003, + "loss": 4.1554, + "step": 3425 + }, + { + "epoch": 0.03426, + "grad_norm": 0.5309877993913092, + "learning_rate": 0.003, + "loss": 4.1773, + "step": 3426 + }, + { + "epoch": 0.03427, + "grad_norm": 0.4899224228978265, + "learning_rate": 0.003, + "loss": 4.1445, + "step": 3427 + }, + { + "epoch": 0.03428, + "grad_norm": 0.5031074901419099, + "learning_rate": 0.003, + "loss": 4.1439, + "step": 3428 + }, + { + "epoch": 0.03429, + "grad_norm": 0.6278449502427672, + "learning_rate": 0.003, + "loss": 4.1652, + "step": 3429 + }, + { + "epoch": 0.0343, + "grad_norm": 0.6739968965302988, + "learning_rate": 0.003, + "loss": 4.1611, + "step": 3430 + }, + { + "epoch": 0.03431, + "grad_norm": 0.7060406786523291, + "learning_rate": 0.003, + "loss": 4.1589, + "step": 3431 + }, + { + "epoch": 0.03432, + "grad_norm": 0.6678436460415758, + "learning_rate": 0.003, + "loss": 4.1677, + "step": 3432 + }, + { + "epoch": 0.03433, + "grad_norm": 0.47826597565957213, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 3433 + }, + { + "epoch": 0.03434, + "grad_norm": 0.4648969929741348, + "learning_rate": 0.003, + "loss": 4.1545, + "step": 3434 + }, + { + "epoch": 0.03435, + "grad_norm": 0.46695854146561, + "learning_rate": 0.003, + "loss": 4.1381, + "step": 3435 + }, + { + "epoch": 0.03436, + "grad_norm": 0.5162770863422669, + "learning_rate": 0.003, + "loss": 4.1665, + "step": 3436 + }, + { + "epoch": 0.03437, + "grad_norm": 0.48303509013107304, + "learning_rate": 0.003, + "loss": 4.1541, + "step": 3437 + }, + { + "epoch": 0.03438, + "grad_norm": 0.5052348764127375, + "learning_rate": 0.003, + "loss": 4.1693, + "step": 3438 + }, + { + "epoch": 0.03439, + "grad_norm": 0.4517303897020918, + "learning_rate": 0.003, + "loss": 4.1673, + "step": 3439 + }, + { + "epoch": 0.0344, + "grad_norm": 0.44296664325342977, + "learning_rate": 0.003, + "loss": 4.14, + "step": 3440 + }, + { + "epoch": 0.03441, + "grad_norm": 0.4450842770576517, + "learning_rate": 0.003, + "loss": 4.1618, + "step": 3441 + }, + { + "epoch": 0.03442, + "grad_norm": 0.4941470142821772, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 3442 + }, + { + "epoch": 0.03443, + "grad_norm": 0.5954542724862985, + "learning_rate": 0.003, + "loss": 4.1632, + "step": 3443 + }, + { + "epoch": 0.03444, + "grad_norm": 0.679140132757601, + "learning_rate": 0.003, + "loss": 4.1786, + "step": 3444 + }, + { + "epoch": 0.03445, + "grad_norm": 0.6296700934818499, + "learning_rate": 0.003, + "loss": 4.1412, + "step": 3445 + }, + { + "epoch": 0.03446, + "grad_norm": 0.5424802868176756, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 3446 + }, + { + "epoch": 0.03447, + "grad_norm": 0.4237052026562954, + "learning_rate": 0.003, + "loss": 4.1749, + "step": 3447 + }, + { + "epoch": 0.03448, + "grad_norm": 0.49663616285480033, + "learning_rate": 0.003, + "loss": 4.1465, + "step": 3448 + }, + { + "epoch": 0.03449, + "grad_norm": 0.4965879325162881, + "learning_rate": 0.003, + "loss": 4.1342, + "step": 3449 + }, + { + "epoch": 0.0345, + "grad_norm": 0.4699418192975183, + "learning_rate": 0.003, + "loss": 4.1679, + "step": 3450 + }, + { + "epoch": 0.03451, + "grad_norm": 0.4342808651184125, + "learning_rate": 0.003, + "loss": 4.1406, + "step": 3451 + }, + { + "epoch": 0.03452, + "grad_norm": 0.48981095252176143, + "learning_rate": 0.003, + "loss": 4.1702, + "step": 3452 + }, + { + "epoch": 0.03453, + "grad_norm": 0.5221855555116031, + "learning_rate": 0.003, + "loss": 4.1474, + "step": 3453 + }, + { + "epoch": 0.03454, + "grad_norm": 0.5252422198675372, + "learning_rate": 0.003, + "loss": 4.1308, + "step": 3454 + }, + { + "epoch": 0.03455, + "grad_norm": 0.5714349317836669, + "learning_rate": 0.003, + "loss": 4.1349, + "step": 3455 + }, + { + "epoch": 0.03456, + "grad_norm": 0.5686518541913316, + "learning_rate": 0.003, + "loss": 4.154, + "step": 3456 + }, + { + "epoch": 0.03457, + "grad_norm": 0.5777235599897509, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 3457 + }, + { + "epoch": 0.03458, + "grad_norm": 0.6023496924245035, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 3458 + }, + { + "epoch": 0.03459, + "grad_norm": 0.6835034129659396, + "learning_rate": 0.003, + "loss": 4.1883, + "step": 3459 + }, + { + "epoch": 0.0346, + "grad_norm": 0.7616829391144937, + "learning_rate": 0.003, + "loss": 4.195, + "step": 3460 + }, + { + "epoch": 0.03461, + "grad_norm": 0.6951133371872186, + "learning_rate": 0.003, + "loss": 4.1716, + "step": 3461 + }, + { + "epoch": 0.03462, + "grad_norm": 0.6533341057192256, + "learning_rate": 0.003, + "loss": 4.1653, + "step": 3462 + }, + { + "epoch": 0.03463, + "grad_norm": 0.7614181817822457, + "learning_rate": 0.003, + "loss": 4.1547, + "step": 3463 + }, + { + "epoch": 0.03464, + "grad_norm": 0.8154255816699014, + "learning_rate": 0.003, + "loss": 4.1769, + "step": 3464 + }, + { + "epoch": 0.03465, + "grad_norm": 0.834367328138272, + "learning_rate": 0.003, + "loss": 4.1879, + "step": 3465 + }, + { + "epoch": 0.03466, + "grad_norm": 0.7054161963486509, + "learning_rate": 0.003, + "loss": 4.1834, + "step": 3466 + }, + { + "epoch": 0.03467, + "grad_norm": 0.6747042802423374, + "learning_rate": 0.003, + "loss": 4.1681, + "step": 3467 + }, + { + "epoch": 0.03468, + "grad_norm": 0.6274506871293699, + "learning_rate": 0.003, + "loss": 4.1789, + "step": 3468 + }, + { + "epoch": 0.03469, + "grad_norm": 0.574247040112387, + "learning_rate": 0.003, + "loss": 4.1751, + "step": 3469 + }, + { + "epoch": 0.0347, + "grad_norm": 0.5474693168535483, + "learning_rate": 0.003, + "loss": 4.1868, + "step": 3470 + }, + { + "epoch": 0.03471, + "grad_norm": 0.5484056382186252, + "learning_rate": 0.003, + "loss": 4.1743, + "step": 3471 + }, + { + "epoch": 0.03472, + "grad_norm": 0.548625120805519, + "learning_rate": 0.003, + "loss": 4.1705, + "step": 3472 + }, + { + "epoch": 0.03473, + "grad_norm": 0.637401868814706, + "learning_rate": 0.003, + "loss": 4.1702, + "step": 3473 + }, + { + "epoch": 0.03474, + "grad_norm": 0.7144883444351541, + "learning_rate": 0.003, + "loss": 4.2012, + "step": 3474 + }, + { + "epoch": 0.03475, + "grad_norm": 0.7979670963387884, + "learning_rate": 0.003, + "loss": 4.1787, + "step": 3475 + }, + { + "epoch": 0.03476, + "grad_norm": 0.7163973548344581, + "learning_rate": 0.003, + "loss": 4.1814, + "step": 3476 + }, + { + "epoch": 0.03477, + "grad_norm": 0.5684898388546081, + "learning_rate": 0.003, + "loss": 4.1719, + "step": 3477 + }, + { + "epoch": 0.03478, + "grad_norm": 0.5861592780845318, + "learning_rate": 0.003, + "loss": 4.1823, + "step": 3478 + }, + { + "epoch": 0.03479, + "grad_norm": 0.5340943622573318, + "learning_rate": 0.003, + "loss": 4.1533, + "step": 3479 + }, + { + "epoch": 0.0348, + "grad_norm": 0.4404622712681107, + "learning_rate": 0.003, + "loss": 4.1503, + "step": 3480 + }, + { + "epoch": 0.03481, + "grad_norm": 0.4792980762796971, + "learning_rate": 0.003, + "loss": 4.1659, + "step": 3481 + }, + { + "epoch": 0.03482, + "grad_norm": 0.4579076307590615, + "learning_rate": 0.003, + "loss": 4.173, + "step": 3482 + }, + { + "epoch": 0.03483, + "grad_norm": 0.42854750510723455, + "learning_rate": 0.003, + "loss": 4.1486, + "step": 3483 + }, + { + "epoch": 0.03484, + "grad_norm": 0.4222950210388672, + "learning_rate": 0.003, + "loss": 4.1912, + "step": 3484 + }, + { + "epoch": 0.03485, + "grad_norm": 0.47787917501254906, + "learning_rate": 0.003, + "loss": 4.1842, + "step": 3485 + }, + { + "epoch": 0.03486, + "grad_norm": 0.6218177691290887, + "learning_rate": 0.003, + "loss": 4.1608, + "step": 3486 + }, + { + "epoch": 0.03487, + "grad_norm": 0.7695122506406105, + "learning_rate": 0.003, + "loss": 4.16, + "step": 3487 + }, + { + "epoch": 0.03488, + "grad_norm": 0.811403718151084, + "learning_rate": 0.003, + "loss": 4.1602, + "step": 3488 + }, + { + "epoch": 0.03489, + "grad_norm": 0.6478567488540471, + "learning_rate": 0.003, + "loss": 4.1728, + "step": 3489 + }, + { + "epoch": 0.0349, + "grad_norm": 0.5349166330706258, + "learning_rate": 0.003, + "loss": 4.1647, + "step": 3490 + }, + { + "epoch": 0.03491, + "grad_norm": 0.5539220367541061, + "learning_rate": 0.003, + "loss": 4.1823, + "step": 3491 + }, + { + "epoch": 0.03492, + "grad_norm": 0.4809491803196432, + "learning_rate": 0.003, + "loss": 4.1646, + "step": 3492 + }, + { + "epoch": 0.03493, + "grad_norm": 0.4980699721500476, + "learning_rate": 0.003, + "loss": 4.1561, + "step": 3493 + }, + { + "epoch": 0.03494, + "grad_norm": 0.6241704569078306, + "learning_rate": 0.003, + "loss": 4.161, + "step": 3494 + }, + { + "epoch": 0.03495, + "grad_norm": 0.6045742174777644, + "learning_rate": 0.003, + "loss": 4.1695, + "step": 3495 + }, + { + "epoch": 0.03496, + "grad_norm": 0.601044244149248, + "learning_rate": 0.003, + "loss": 4.1431, + "step": 3496 + }, + { + "epoch": 0.03497, + "grad_norm": 0.5400771740929299, + "learning_rate": 0.003, + "loss": 4.1771, + "step": 3497 + }, + { + "epoch": 0.03498, + "grad_norm": 0.713136808858786, + "learning_rate": 0.003, + "loss": 4.157, + "step": 3498 + }, + { + "epoch": 0.03499, + "grad_norm": 0.7906121138775885, + "learning_rate": 0.003, + "loss": 4.1705, + "step": 3499 + }, + { + "epoch": 0.035, + "grad_norm": 0.7463261361411888, + "learning_rate": 0.003, + "loss": 4.1867, + "step": 3500 + }, + { + "epoch": 0.03501, + "grad_norm": 0.6401328140167164, + "learning_rate": 0.003, + "loss": 4.1792, + "step": 3501 + }, + { + "epoch": 0.03502, + "grad_norm": 0.5849954396740931, + "learning_rate": 0.003, + "loss": 4.1761, + "step": 3502 + }, + { + "epoch": 0.03503, + "grad_norm": 0.5132547849361668, + "learning_rate": 0.003, + "loss": 4.1879, + "step": 3503 + }, + { + "epoch": 0.03504, + "grad_norm": 0.5053019189532232, + "learning_rate": 0.003, + "loss": 4.159, + "step": 3504 + }, + { + "epoch": 0.03505, + "grad_norm": 0.5412370735339649, + "learning_rate": 0.003, + "loss": 4.177, + "step": 3505 + }, + { + "epoch": 0.03506, + "grad_norm": 0.5174741400250557, + "learning_rate": 0.003, + "loss": 4.1855, + "step": 3506 + }, + { + "epoch": 0.03507, + "grad_norm": 0.5828929341462858, + "learning_rate": 0.003, + "loss": 4.1653, + "step": 3507 + }, + { + "epoch": 0.03508, + "grad_norm": 0.6123621534119361, + "learning_rate": 0.003, + "loss": 4.1469, + "step": 3508 + }, + { + "epoch": 0.03509, + "grad_norm": 0.529763454477017, + "learning_rate": 0.003, + "loss": 4.1565, + "step": 3509 + }, + { + "epoch": 0.0351, + "grad_norm": 0.4991440774069516, + "learning_rate": 0.003, + "loss": 4.1468, + "step": 3510 + }, + { + "epoch": 0.03511, + "grad_norm": 0.5106927805679168, + "learning_rate": 0.003, + "loss": 4.1605, + "step": 3511 + }, + { + "epoch": 0.03512, + "grad_norm": 0.5290709648712968, + "learning_rate": 0.003, + "loss": 4.1497, + "step": 3512 + }, + { + "epoch": 0.03513, + "grad_norm": 0.5838748180600412, + "learning_rate": 0.003, + "loss": 4.1534, + "step": 3513 + }, + { + "epoch": 0.03514, + "grad_norm": 0.8057246907688399, + "learning_rate": 0.003, + "loss": 4.173, + "step": 3514 + }, + { + "epoch": 0.03515, + "grad_norm": 1.0135173467038623, + "learning_rate": 0.003, + "loss": 4.1945, + "step": 3515 + }, + { + "epoch": 0.03516, + "grad_norm": 0.8727734267667479, + "learning_rate": 0.003, + "loss": 4.2288, + "step": 3516 + }, + { + "epoch": 0.03517, + "grad_norm": 0.7059545473280222, + "learning_rate": 0.003, + "loss": 4.1709, + "step": 3517 + }, + { + "epoch": 0.03518, + "grad_norm": 0.7000960321874965, + "learning_rate": 0.003, + "loss": 4.1599, + "step": 3518 + }, + { + "epoch": 0.03519, + "grad_norm": 0.6627465613911154, + "learning_rate": 0.003, + "loss": 4.2036, + "step": 3519 + }, + { + "epoch": 0.0352, + "grad_norm": 0.7079684591627484, + "learning_rate": 0.003, + "loss": 4.1815, + "step": 3520 + }, + { + "epoch": 0.03521, + "grad_norm": 0.7564247406144935, + "learning_rate": 0.003, + "loss": 4.1833, + "step": 3521 + }, + { + "epoch": 0.03522, + "grad_norm": 0.6970141789263714, + "learning_rate": 0.003, + "loss": 4.1807, + "step": 3522 + }, + { + "epoch": 0.03523, + "grad_norm": 0.6458343838785704, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 3523 + }, + { + "epoch": 0.03524, + "grad_norm": 0.727411286446386, + "learning_rate": 0.003, + "loss": 4.1943, + "step": 3524 + }, + { + "epoch": 0.03525, + "grad_norm": 0.6885063026654827, + "learning_rate": 0.003, + "loss": 4.163, + "step": 3525 + }, + { + "epoch": 0.03526, + "grad_norm": 0.503068225605494, + "learning_rate": 0.003, + "loss": 4.16, + "step": 3526 + }, + { + "epoch": 0.03527, + "grad_norm": 0.4984239052259041, + "learning_rate": 0.003, + "loss": 4.1946, + "step": 3527 + }, + { + "epoch": 0.03528, + "grad_norm": 0.48402535036066135, + "learning_rate": 0.003, + "loss": 4.1744, + "step": 3528 + }, + { + "epoch": 0.03529, + "grad_norm": 0.4945614778246426, + "learning_rate": 0.003, + "loss": 4.1679, + "step": 3529 + }, + { + "epoch": 0.0353, + "grad_norm": 0.488546537630184, + "learning_rate": 0.003, + "loss": 4.1666, + "step": 3530 + }, + { + "epoch": 0.03531, + "grad_norm": 0.4615658807568464, + "learning_rate": 0.003, + "loss": 4.173, + "step": 3531 + }, + { + "epoch": 0.03532, + "grad_norm": 0.5005896772829939, + "learning_rate": 0.003, + "loss": 4.1751, + "step": 3532 + }, + { + "epoch": 0.03533, + "grad_norm": 0.5146344370982837, + "learning_rate": 0.003, + "loss": 4.1732, + "step": 3533 + }, + { + "epoch": 0.03534, + "grad_norm": 0.5278648337419918, + "learning_rate": 0.003, + "loss": 4.1672, + "step": 3534 + }, + { + "epoch": 0.03535, + "grad_norm": 0.573214872592988, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 3535 + }, + { + "epoch": 0.03536, + "grad_norm": 0.6216412138977996, + "learning_rate": 0.003, + "loss": 4.1419, + "step": 3536 + }, + { + "epoch": 0.03537, + "grad_norm": 0.641334014656523, + "learning_rate": 0.003, + "loss": 4.1511, + "step": 3537 + }, + { + "epoch": 0.03538, + "grad_norm": 0.7120173819031035, + "learning_rate": 0.003, + "loss": 4.1587, + "step": 3538 + }, + { + "epoch": 0.03539, + "grad_norm": 0.7562087971985374, + "learning_rate": 0.003, + "loss": 4.1514, + "step": 3539 + }, + { + "epoch": 0.0354, + "grad_norm": 0.745826443201787, + "learning_rate": 0.003, + "loss": 4.1925, + "step": 3540 + }, + { + "epoch": 0.03541, + "grad_norm": 0.6880220783097426, + "learning_rate": 0.003, + "loss": 4.1777, + "step": 3541 + }, + { + "epoch": 0.03542, + "grad_norm": 0.6660987603699731, + "learning_rate": 0.003, + "loss": 4.158, + "step": 3542 + }, + { + "epoch": 0.03543, + "grad_norm": 0.5966852103308192, + "learning_rate": 0.003, + "loss": 4.1666, + "step": 3543 + }, + { + "epoch": 0.03544, + "grad_norm": 0.6147196669679205, + "learning_rate": 0.003, + "loss": 4.1752, + "step": 3544 + }, + { + "epoch": 0.03545, + "grad_norm": 0.5876025358116799, + "learning_rate": 0.003, + "loss": 4.1436, + "step": 3545 + }, + { + "epoch": 0.03546, + "grad_norm": 0.5394946467408606, + "learning_rate": 0.003, + "loss": 4.161, + "step": 3546 + }, + { + "epoch": 0.03547, + "grad_norm": 0.4760714738457914, + "learning_rate": 0.003, + "loss": 4.1659, + "step": 3547 + }, + { + "epoch": 0.03548, + "grad_norm": 0.511046276124325, + "learning_rate": 0.003, + "loss": 4.1712, + "step": 3548 + }, + { + "epoch": 0.03549, + "grad_norm": 0.5497663152303965, + "learning_rate": 0.003, + "loss": 4.1676, + "step": 3549 + }, + { + "epoch": 0.0355, + "grad_norm": 0.5734708015673942, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 3550 + }, + { + "epoch": 0.03551, + "grad_norm": 0.5704932035569535, + "learning_rate": 0.003, + "loss": 4.1799, + "step": 3551 + }, + { + "epoch": 0.03552, + "grad_norm": 0.5413961705528875, + "learning_rate": 0.003, + "loss": 4.1392, + "step": 3552 + }, + { + "epoch": 0.03553, + "grad_norm": 0.5629706262237084, + "learning_rate": 0.003, + "loss": 4.1538, + "step": 3553 + }, + { + "epoch": 0.03554, + "grad_norm": 0.6847813269812701, + "learning_rate": 0.003, + "loss": 4.1633, + "step": 3554 + }, + { + "epoch": 0.03555, + "grad_norm": 0.7442929788518159, + "learning_rate": 0.003, + "loss": 4.1792, + "step": 3555 + }, + { + "epoch": 0.03556, + "grad_norm": 0.6995396794139108, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 3556 + }, + { + "epoch": 0.03557, + "grad_norm": 0.6870977550247097, + "learning_rate": 0.003, + "loss": 4.1641, + "step": 3557 + }, + { + "epoch": 0.03558, + "grad_norm": 0.4853466992283488, + "learning_rate": 0.003, + "loss": 4.1831, + "step": 3558 + }, + { + "epoch": 0.03559, + "grad_norm": 0.4762880827808817, + "learning_rate": 0.003, + "loss": 4.1691, + "step": 3559 + }, + { + "epoch": 0.0356, + "grad_norm": 0.44888469164051914, + "learning_rate": 0.003, + "loss": 4.1691, + "step": 3560 + }, + { + "epoch": 0.03561, + "grad_norm": 0.4860772847324368, + "learning_rate": 0.003, + "loss": 4.1939, + "step": 3561 + }, + { + "epoch": 0.03562, + "grad_norm": 0.609514740868879, + "learning_rate": 0.003, + "loss": 4.1615, + "step": 3562 + }, + { + "epoch": 0.03563, + "grad_norm": 0.5645288584044179, + "learning_rate": 0.003, + "loss": 4.1919, + "step": 3563 + }, + { + "epoch": 0.03564, + "grad_norm": 0.5713508531485367, + "learning_rate": 0.003, + "loss": 4.1666, + "step": 3564 + }, + { + "epoch": 0.03565, + "grad_norm": 0.6809281314446707, + "learning_rate": 0.003, + "loss": 4.1657, + "step": 3565 + }, + { + "epoch": 0.03566, + "grad_norm": 0.8446250147895774, + "learning_rate": 0.003, + "loss": 4.1796, + "step": 3566 + }, + { + "epoch": 0.03567, + "grad_norm": 0.7816404227486906, + "learning_rate": 0.003, + "loss": 4.173, + "step": 3567 + }, + { + "epoch": 0.03568, + "grad_norm": 0.565515569510055, + "learning_rate": 0.003, + "loss": 4.1588, + "step": 3568 + }, + { + "epoch": 0.03569, + "grad_norm": 0.6392539102485247, + "learning_rate": 0.003, + "loss": 4.1604, + "step": 3569 + }, + { + "epoch": 0.0357, + "grad_norm": 0.6094618901019504, + "learning_rate": 0.003, + "loss": 4.1618, + "step": 3570 + }, + { + "epoch": 0.03571, + "grad_norm": 0.516014245189601, + "learning_rate": 0.003, + "loss": 4.1658, + "step": 3571 + }, + { + "epoch": 0.03572, + "grad_norm": 0.5649956960536694, + "learning_rate": 0.003, + "loss": 4.1874, + "step": 3572 + }, + { + "epoch": 0.03573, + "grad_norm": 0.5285488438884934, + "learning_rate": 0.003, + "loss": 4.1551, + "step": 3573 + }, + { + "epoch": 0.03574, + "grad_norm": 0.5059015354099536, + "learning_rate": 0.003, + "loss": 4.1778, + "step": 3574 + }, + { + "epoch": 0.03575, + "grad_norm": 0.5626349168929159, + "learning_rate": 0.003, + "loss": 4.1565, + "step": 3575 + }, + { + "epoch": 0.03576, + "grad_norm": 0.5730355700030897, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 3576 + }, + { + "epoch": 0.03577, + "grad_norm": 0.5081514156321213, + "learning_rate": 0.003, + "loss": 4.1719, + "step": 3577 + }, + { + "epoch": 0.03578, + "grad_norm": 0.4522800880743083, + "learning_rate": 0.003, + "loss": 4.1594, + "step": 3578 + }, + { + "epoch": 0.03579, + "grad_norm": 0.4334604254761663, + "learning_rate": 0.003, + "loss": 4.1743, + "step": 3579 + }, + { + "epoch": 0.0358, + "grad_norm": 0.478675617542924, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 3580 + }, + { + "epoch": 0.03581, + "grad_norm": 0.5103431984153175, + "learning_rate": 0.003, + "loss": 4.1461, + "step": 3581 + }, + { + "epoch": 0.03582, + "grad_norm": 0.5133704031348394, + "learning_rate": 0.003, + "loss": 4.144, + "step": 3582 + }, + { + "epoch": 0.03583, + "grad_norm": 0.5875445897748297, + "learning_rate": 0.003, + "loss": 4.1484, + "step": 3583 + }, + { + "epoch": 0.03584, + "grad_norm": 0.5543663682322499, + "learning_rate": 0.003, + "loss": 4.1383, + "step": 3584 + }, + { + "epoch": 0.03585, + "grad_norm": 0.49560990112191017, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 3585 + }, + { + "epoch": 0.03586, + "grad_norm": 0.5080536513623409, + "learning_rate": 0.003, + "loss": 4.1554, + "step": 3586 + }, + { + "epoch": 0.03587, + "grad_norm": 0.47897066887270817, + "learning_rate": 0.003, + "loss": 4.1534, + "step": 3587 + }, + { + "epoch": 0.03588, + "grad_norm": 0.4859791474646478, + "learning_rate": 0.003, + "loss": 4.1511, + "step": 3588 + }, + { + "epoch": 0.03589, + "grad_norm": 0.5360881510224248, + "learning_rate": 0.003, + "loss": 4.1696, + "step": 3589 + }, + { + "epoch": 0.0359, + "grad_norm": 0.6282942855775232, + "learning_rate": 0.003, + "loss": 4.1337, + "step": 3590 + }, + { + "epoch": 0.03591, + "grad_norm": 0.7149898709877768, + "learning_rate": 0.003, + "loss": 4.1753, + "step": 3591 + }, + { + "epoch": 0.03592, + "grad_norm": 0.6656767655390144, + "learning_rate": 0.003, + "loss": 4.1541, + "step": 3592 + }, + { + "epoch": 0.03593, + "grad_norm": 0.5744352573697437, + "learning_rate": 0.003, + "loss": 4.1525, + "step": 3593 + }, + { + "epoch": 0.03594, + "grad_norm": 0.5564608607734087, + "learning_rate": 0.003, + "loss": 4.1586, + "step": 3594 + }, + { + "epoch": 0.03595, + "grad_norm": 0.6440817185713912, + "learning_rate": 0.003, + "loss": 4.1511, + "step": 3595 + }, + { + "epoch": 0.03596, + "grad_norm": 0.8208781786922031, + "learning_rate": 0.003, + "loss": 4.1965, + "step": 3596 + }, + { + "epoch": 0.03597, + "grad_norm": 0.9222891823782106, + "learning_rate": 0.003, + "loss": 4.1825, + "step": 3597 + }, + { + "epoch": 0.03598, + "grad_norm": 0.8427211641310154, + "learning_rate": 0.003, + "loss": 4.187, + "step": 3598 + }, + { + "epoch": 0.03599, + "grad_norm": 0.852351507152875, + "learning_rate": 0.003, + "loss": 4.1882, + "step": 3599 + }, + { + "epoch": 0.036, + "grad_norm": 0.8507559173234883, + "learning_rate": 0.003, + "loss": 4.2187, + "step": 3600 + }, + { + "epoch": 0.03601, + "grad_norm": 0.9636100541843863, + "learning_rate": 0.003, + "loss": 4.1988, + "step": 3601 + }, + { + "epoch": 0.03602, + "grad_norm": 1.043053481597312, + "learning_rate": 0.003, + "loss": 4.2019, + "step": 3602 + }, + { + "epoch": 0.03603, + "grad_norm": 0.8586008867305468, + "learning_rate": 0.003, + "loss": 4.1772, + "step": 3603 + }, + { + "epoch": 0.03604, + "grad_norm": 0.8633622676072363, + "learning_rate": 0.003, + "loss": 4.2215, + "step": 3604 + }, + { + "epoch": 0.03605, + "grad_norm": 0.8221166093366499, + "learning_rate": 0.003, + "loss": 4.213, + "step": 3605 + }, + { + "epoch": 0.03606, + "grad_norm": 0.7191787485946937, + "learning_rate": 0.003, + "loss": 4.1949, + "step": 3606 + }, + { + "epoch": 0.03607, + "grad_norm": 0.6624835833221929, + "learning_rate": 0.003, + "loss": 4.1849, + "step": 3607 + }, + { + "epoch": 0.03608, + "grad_norm": 0.6824569834328487, + "learning_rate": 0.003, + "loss": 4.1883, + "step": 3608 + }, + { + "epoch": 0.03609, + "grad_norm": 0.6457675177801978, + "learning_rate": 0.003, + "loss": 4.2132, + "step": 3609 + }, + { + "epoch": 0.0361, + "grad_norm": 0.6295884180353496, + "learning_rate": 0.003, + "loss": 4.2142, + "step": 3610 + }, + { + "epoch": 0.03611, + "grad_norm": 0.590169158296821, + "learning_rate": 0.003, + "loss": 4.1598, + "step": 3611 + }, + { + "epoch": 0.03612, + "grad_norm": 0.5389921423268351, + "learning_rate": 0.003, + "loss": 4.153, + "step": 3612 + }, + { + "epoch": 0.03613, + "grad_norm": 0.5497537704200285, + "learning_rate": 0.003, + "loss": 4.1961, + "step": 3613 + }, + { + "epoch": 0.03614, + "grad_norm": 0.4843702888139074, + "learning_rate": 0.003, + "loss": 4.1672, + "step": 3614 + }, + { + "epoch": 0.03615, + "grad_norm": 0.48025803520879506, + "learning_rate": 0.003, + "loss": 4.1788, + "step": 3615 + }, + { + "epoch": 0.03616, + "grad_norm": 0.46348349208320905, + "learning_rate": 0.003, + "loss": 4.1857, + "step": 3616 + }, + { + "epoch": 0.03617, + "grad_norm": 0.39854001217238155, + "learning_rate": 0.003, + "loss": 4.1763, + "step": 3617 + }, + { + "epoch": 0.03618, + "grad_norm": 0.4027281929968038, + "learning_rate": 0.003, + "loss": 4.1671, + "step": 3618 + }, + { + "epoch": 0.03619, + "grad_norm": 0.3985420323335125, + "learning_rate": 0.003, + "loss": 4.1636, + "step": 3619 + }, + { + "epoch": 0.0362, + "grad_norm": 0.3613468610024924, + "learning_rate": 0.003, + "loss": 4.1676, + "step": 3620 + }, + { + "epoch": 0.03621, + "grad_norm": 0.33261587429198597, + "learning_rate": 0.003, + "loss": 4.1667, + "step": 3621 + }, + { + "epoch": 0.03622, + "grad_norm": 0.3403864350330926, + "learning_rate": 0.003, + "loss": 4.1353, + "step": 3622 + }, + { + "epoch": 0.03623, + "grad_norm": 0.3583311023359255, + "learning_rate": 0.003, + "loss": 4.1243, + "step": 3623 + }, + { + "epoch": 0.03624, + "grad_norm": 0.32450649557353767, + "learning_rate": 0.003, + "loss": 4.1298, + "step": 3624 + }, + { + "epoch": 0.03625, + "grad_norm": 0.3235232357698917, + "learning_rate": 0.003, + "loss": 4.1676, + "step": 3625 + }, + { + "epoch": 0.03626, + "grad_norm": 0.3029552071156344, + "learning_rate": 0.003, + "loss": 4.1418, + "step": 3626 + }, + { + "epoch": 0.03627, + "grad_norm": 0.3634806878840478, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 3627 + }, + { + "epoch": 0.03628, + "grad_norm": 0.5031788929713537, + "learning_rate": 0.003, + "loss": 4.1538, + "step": 3628 + }, + { + "epoch": 0.03629, + "grad_norm": 0.7314661052924161, + "learning_rate": 0.003, + "loss": 4.1821, + "step": 3629 + }, + { + "epoch": 0.0363, + "grad_norm": 0.9369830066901181, + "learning_rate": 0.003, + "loss": 4.1775, + "step": 3630 + }, + { + "epoch": 0.03631, + "grad_norm": 0.7563300299386175, + "learning_rate": 0.003, + "loss": 4.1699, + "step": 3631 + }, + { + "epoch": 0.03632, + "grad_norm": 0.6272780473551262, + "learning_rate": 0.003, + "loss": 4.1574, + "step": 3632 + }, + { + "epoch": 0.03633, + "grad_norm": 0.6625289405061192, + "learning_rate": 0.003, + "loss": 4.1907, + "step": 3633 + }, + { + "epoch": 0.03634, + "grad_norm": 0.624342554157624, + "learning_rate": 0.003, + "loss": 4.1627, + "step": 3634 + }, + { + "epoch": 0.03635, + "grad_norm": 0.583607515091326, + "learning_rate": 0.003, + "loss": 4.149, + "step": 3635 + }, + { + "epoch": 0.03636, + "grad_norm": 0.5584832759878093, + "learning_rate": 0.003, + "loss": 4.1652, + "step": 3636 + }, + { + "epoch": 0.03637, + "grad_norm": 0.5837086725119016, + "learning_rate": 0.003, + "loss": 4.1561, + "step": 3637 + }, + { + "epoch": 0.03638, + "grad_norm": 0.6095521315464627, + "learning_rate": 0.003, + "loss": 4.1649, + "step": 3638 + }, + { + "epoch": 0.03639, + "grad_norm": 0.6657809000387206, + "learning_rate": 0.003, + "loss": 4.1922, + "step": 3639 + }, + { + "epoch": 0.0364, + "grad_norm": 0.6289647060691178, + "learning_rate": 0.003, + "loss": 4.1882, + "step": 3640 + }, + { + "epoch": 0.03641, + "grad_norm": 0.6059605963156967, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 3641 + }, + { + "epoch": 0.03642, + "grad_norm": 0.5705396060730448, + "learning_rate": 0.003, + "loss": 4.1875, + "step": 3642 + }, + { + "epoch": 0.03643, + "grad_norm": 0.6035912533971294, + "learning_rate": 0.003, + "loss": 4.1489, + "step": 3643 + }, + { + "epoch": 0.03644, + "grad_norm": 0.6309633478231041, + "learning_rate": 0.003, + "loss": 4.1769, + "step": 3644 + }, + { + "epoch": 0.03645, + "grad_norm": 0.6355334191850317, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 3645 + }, + { + "epoch": 0.03646, + "grad_norm": 0.5786595124970733, + "learning_rate": 0.003, + "loss": 4.1657, + "step": 3646 + }, + { + "epoch": 0.03647, + "grad_norm": 0.5949997765584221, + "learning_rate": 0.003, + "loss": 4.1642, + "step": 3647 + }, + { + "epoch": 0.03648, + "grad_norm": 0.5464060834426805, + "learning_rate": 0.003, + "loss": 4.1502, + "step": 3648 + }, + { + "epoch": 0.03649, + "grad_norm": 0.6161442180745826, + "learning_rate": 0.003, + "loss": 4.1777, + "step": 3649 + }, + { + "epoch": 0.0365, + "grad_norm": 0.7276354515874378, + "learning_rate": 0.003, + "loss": 4.1605, + "step": 3650 + }, + { + "epoch": 0.03651, + "grad_norm": 0.879246794128947, + "learning_rate": 0.003, + "loss": 4.1938, + "step": 3651 + }, + { + "epoch": 0.03652, + "grad_norm": 0.8889526606107412, + "learning_rate": 0.003, + "loss": 4.1834, + "step": 3652 + }, + { + "epoch": 0.03653, + "grad_norm": 0.7646404951633242, + "learning_rate": 0.003, + "loss": 4.162, + "step": 3653 + }, + { + "epoch": 0.03654, + "grad_norm": 0.7290317207893501, + "learning_rate": 0.003, + "loss": 4.1597, + "step": 3654 + }, + { + "epoch": 0.03655, + "grad_norm": 0.7063774172804314, + "learning_rate": 0.003, + "loss": 4.1865, + "step": 3655 + }, + { + "epoch": 0.03656, + "grad_norm": 0.7160866471184082, + "learning_rate": 0.003, + "loss": 4.1743, + "step": 3656 + }, + { + "epoch": 0.03657, + "grad_norm": 0.5821614514235085, + "learning_rate": 0.003, + "loss": 4.1421, + "step": 3657 + }, + { + "epoch": 0.03658, + "grad_norm": 0.5963209078366205, + "learning_rate": 0.003, + "loss": 4.2061, + "step": 3658 + }, + { + "epoch": 0.03659, + "grad_norm": 0.5885163411512104, + "learning_rate": 0.003, + "loss": 4.1544, + "step": 3659 + }, + { + "epoch": 0.0366, + "grad_norm": 0.6724966576985278, + "learning_rate": 0.003, + "loss": 4.1869, + "step": 3660 + }, + { + "epoch": 0.03661, + "grad_norm": 0.70045611677563, + "learning_rate": 0.003, + "loss": 4.1682, + "step": 3661 + }, + { + "epoch": 0.03662, + "grad_norm": 0.7038628116184068, + "learning_rate": 0.003, + "loss": 4.1668, + "step": 3662 + }, + { + "epoch": 0.03663, + "grad_norm": 0.7338499926784156, + "learning_rate": 0.003, + "loss": 4.1859, + "step": 3663 + }, + { + "epoch": 0.03664, + "grad_norm": 0.6003408827366159, + "learning_rate": 0.003, + "loss": 4.1599, + "step": 3664 + }, + { + "epoch": 0.03665, + "grad_norm": 0.531193738984965, + "learning_rate": 0.003, + "loss": 4.1816, + "step": 3665 + }, + { + "epoch": 0.03666, + "grad_norm": 0.502967345745313, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 3666 + }, + { + "epoch": 0.03667, + "grad_norm": 0.4403238489897598, + "learning_rate": 0.003, + "loss": 4.1421, + "step": 3667 + }, + { + "epoch": 0.03668, + "grad_norm": 0.4214554821381715, + "learning_rate": 0.003, + "loss": 4.1542, + "step": 3668 + }, + { + "epoch": 0.03669, + "grad_norm": 0.3786750496308303, + "learning_rate": 0.003, + "loss": 4.129, + "step": 3669 + }, + { + "epoch": 0.0367, + "grad_norm": 0.36562759077361934, + "learning_rate": 0.003, + "loss": 4.1666, + "step": 3670 + }, + { + "epoch": 0.03671, + "grad_norm": 0.4132800551107581, + "learning_rate": 0.003, + "loss": 4.1551, + "step": 3671 + }, + { + "epoch": 0.03672, + "grad_norm": 0.49256525507220217, + "learning_rate": 0.003, + "loss": 4.1596, + "step": 3672 + }, + { + "epoch": 0.03673, + "grad_norm": 0.6349831894939928, + "learning_rate": 0.003, + "loss": 4.171, + "step": 3673 + }, + { + "epoch": 0.03674, + "grad_norm": 0.6952242122376082, + "learning_rate": 0.003, + "loss": 4.1369, + "step": 3674 + }, + { + "epoch": 0.03675, + "grad_norm": 0.5660824888307048, + "learning_rate": 0.003, + "loss": 4.1736, + "step": 3675 + }, + { + "epoch": 0.03676, + "grad_norm": 0.4101727822433613, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 3676 + }, + { + "epoch": 0.03677, + "grad_norm": 0.4248844160991166, + "learning_rate": 0.003, + "loss": 4.1421, + "step": 3677 + }, + { + "epoch": 0.03678, + "grad_norm": 0.40692106726610927, + "learning_rate": 0.003, + "loss": 4.1681, + "step": 3678 + }, + { + "epoch": 0.03679, + "grad_norm": 0.43656051390996525, + "learning_rate": 0.003, + "loss": 4.1576, + "step": 3679 + }, + { + "epoch": 0.0368, + "grad_norm": 0.4651408055341787, + "learning_rate": 0.003, + "loss": 4.1504, + "step": 3680 + }, + { + "epoch": 0.03681, + "grad_norm": 0.4431552362299314, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 3681 + }, + { + "epoch": 0.03682, + "grad_norm": 0.45670367087714675, + "learning_rate": 0.003, + "loss": 4.1622, + "step": 3682 + }, + { + "epoch": 0.03683, + "grad_norm": 0.48732877269273184, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 3683 + }, + { + "epoch": 0.03684, + "grad_norm": 0.49569158763050314, + "learning_rate": 0.003, + "loss": 4.1464, + "step": 3684 + }, + { + "epoch": 0.03685, + "grad_norm": 0.5827537649992007, + "learning_rate": 0.003, + "loss": 4.1345, + "step": 3685 + }, + { + "epoch": 0.03686, + "grad_norm": 0.7130056023766307, + "learning_rate": 0.003, + "loss": 4.1622, + "step": 3686 + }, + { + "epoch": 0.03687, + "grad_norm": 0.806080078993977, + "learning_rate": 0.003, + "loss": 4.1913, + "step": 3687 + }, + { + "epoch": 0.03688, + "grad_norm": 0.8654301422344852, + "learning_rate": 0.003, + "loss": 4.1853, + "step": 3688 + }, + { + "epoch": 0.03689, + "grad_norm": 0.824342247624797, + "learning_rate": 0.003, + "loss": 4.1558, + "step": 3689 + }, + { + "epoch": 0.0369, + "grad_norm": 0.8399398898767244, + "learning_rate": 0.003, + "loss": 4.1897, + "step": 3690 + }, + { + "epoch": 0.03691, + "grad_norm": 0.8376909355521104, + "learning_rate": 0.003, + "loss": 4.1757, + "step": 3691 + }, + { + "epoch": 0.03692, + "grad_norm": 0.7664237094520844, + "learning_rate": 0.003, + "loss": 4.2095, + "step": 3692 + }, + { + "epoch": 0.03693, + "grad_norm": 0.6756940227740064, + "learning_rate": 0.003, + "loss": 4.1782, + "step": 3693 + }, + { + "epoch": 0.03694, + "grad_norm": 0.665602669017162, + "learning_rate": 0.003, + "loss": 4.1706, + "step": 3694 + }, + { + "epoch": 0.03695, + "grad_norm": 0.6845010372537103, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 3695 + }, + { + "epoch": 0.03696, + "grad_norm": 0.8052078414912427, + "learning_rate": 0.003, + "loss": 4.1755, + "step": 3696 + }, + { + "epoch": 0.03697, + "grad_norm": 0.8787348855409773, + "learning_rate": 0.003, + "loss": 4.1745, + "step": 3697 + }, + { + "epoch": 0.03698, + "grad_norm": 0.8247546282969409, + "learning_rate": 0.003, + "loss": 4.2121, + "step": 3698 + }, + { + "epoch": 0.03699, + "grad_norm": 0.7409801258428377, + "learning_rate": 0.003, + "loss": 4.1946, + "step": 3699 + }, + { + "epoch": 0.037, + "grad_norm": 0.6088451406774416, + "learning_rate": 0.003, + "loss": 4.1795, + "step": 3700 + }, + { + "epoch": 0.03701, + "grad_norm": 0.6322208971568581, + "learning_rate": 0.003, + "loss": 4.1844, + "step": 3701 + }, + { + "epoch": 0.03702, + "grad_norm": 0.5054476642910108, + "learning_rate": 0.003, + "loss": 4.1646, + "step": 3702 + }, + { + "epoch": 0.03703, + "grad_norm": 0.5443386034292382, + "learning_rate": 0.003, + "loss": 4.1798, + "step": 3703 + }, + { + "epoch": 0.03704, + "grad_norm": 0.5944500200700061, + "learning_rate": 0.003, + "loss": 4.1823, + "step": 3704 + }, + { + "epoch": 0.03705, + "grad_norm": 0.600099616356784, + "learning_rate": 0.003, + "loss": 4.1957, + "step": 3705 + }, + { + "epoch": 0.03706, + "grad_norm": 0.6109788360926119, + "learning_rate": 0.003, + "loss": 4.182, + "step": 3706 + }, + { + "epoch": 0.03707, + "grad_norm": 0.5293706856345912, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 3707 + }, + { + "epoch": 0.03708, + "grad_norm": 0.47791373263744796, + "learning_rate": 0.003, + "loss": 4.1753, + "step": 3708 + }, + { + "epoch": 0.03709, + "grad_norm": 0.49156286964072154, + "learning_rate": 0.003, + "loss": 4.1505, + "step": 3709 + }, + { + "epoch": 0.0371, + "grad_norm": 0.5248101152432202, + "learning_rate": 0.003, + "loss": 4.1867, + "step": 3710 + }, + { + "epoch": 0.03711, + "grad_norm": 0.6117031890743618, + "learning_rate": 0.003, + "loss": 4.1604, + "step": 3711 + }, + { + "epoch": 0.03712, + "grad_norm": 0.6254066152554677, + "learning_rate": 0.003, + "loss": 4.1614, + "step": 3712 + }, + { + "epoch": 0.03713, + "grad_norm": 0.6264703701067362, + "learning_rate": 0.003, + "loss": 4.1504, + "step": 3713 + }, + { + "epoch": 0.03714, + "grad_norm": 0.620653200418187, + "learning_rate": 0.003, + "loss": 4.1614, + "step": 3714 + }, + { + "epoch": 0.03715, + "grad_norm": 0.5248205552325349, + "learning_rate": 0.003, + "loss": 4.1655, + "step": 3715 + }, + { + "epoch": 0.03716, + "grad_norm": 0.5526084707582196, + "learning_rate": 0.003, + "loss": 4.1555, + "step": 3716 + }, + { + "epoch": 0.03717, + "grad_norm": 0.49730839862475784, + "learning_rate": 0.003, + "loss": 4.1545, + "step": 3717 + }, + { + "epoch": 0.03718, + "grad_norm": 0.462962673452178, + "learning_rate": 0.003, + "loss": 4.1258, + "step": 3718 + }, + { + "epoch": 0.03719, + "grad_norm": 0.3930444861352458, + "learning_rate": 0.003, + "loss": 4.1539, + "step": 3719 + }, + { + "epoch": 0.0372, + "grad_norm": 0.40801634488970573, + "learning_rate": 0.003, + "loss": 4.1555, + "step": 3720 + }, + { + "epoch": 0.03721, + "grad_norm": 0.44606614802333183, + "learning_rate": 0.003, + "loss": 4.1436, + "step": 3721 + }, + { + "epoch": 0.03722, + "grad_norm": 0.39111226545505906, + "learning_rate": 0.003, + "loss": 4.1429, + "step": 3722 + }, + { + "epoch": 0.03723, + "grad_norm": 0.4408949322752162, + "learning_rate": 0.003, + "loss": 4.1757, + "step": 3723 + }, + { + "epoch": 0.03724, + "grad_norm": 0.5569410843865424, + "learning_rate": 0.003, + "loss": 4.156, + "step": 3724 + }, + { + "epoch": 0.03725, + "grad_norm": 0.6820788875058175, + "learning_rate": 0.003, + "loss": 4.1691, + "step": 3725 + }, + { + "epoch": 0.03726, + "grad_norm": 0.7697397803396343, + "learning_rate": 0.003, + "loss": 4.1495, + "step": 3726 + }, + { + "epoch": 0.03727, + "grad_norm": 0.7112467858706011, + "learning_rate": 0.003, + "loss": 4.1828, + "step": 3727 + }, + { + "epoch": 0.03728, + "grad_norm": 0.6285060537833618, + "learning_rate": 0.003, + "loss": 4.1298, + "step": 3728 + }, + { + "epoch": 0.03729, + "grad_norm": 0.5988137930316242, + "learning_rate": 0.003, + "loss": 4.1659, + "step": 3729 + }, + { + "epoch": 0.0373, + "grad_norm": 0.6000038779888225, + "learning_rate": 0.003, + "loss": 4.1483, + "step": 3730 + }, + { + "epoch": 0.03731, + "grad_norm": 0.5727776342995737, + "learning_rate": 0.003, + "loss": 4.1728, + "step": 3731 + }, + { + "epoch": 0.03732, + "grad_norm": 0.47697561747753886, + "learning_rate": 0.003, + "loss": 4.1673, + "step": 3732 + }, + { + "epoch": 0.03733, + "grad_norm": 0.44272447557143685, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 3733 + }, + { + "epoch": 0.03734, + "grad_norm": 0.4247027107702777, + "learning_rate": 0.003, + "loss": 4.145, + "step": 3734 + }, + { + "epoch": 0.03735, + "grad_norm": 0.4479925161214012, + "learning_rate": 0.003, + "loss": 4.1383, + "step": 3735 + }, + { + "epoch": 0.03736, + "grad_norm": 0.45973685869578795, + "learning_rate": 0.003, + "loss": 4.1686, + "step": 3736 + }, + { + "epoch": 0.03737, + "grad_norm": 0.3920498911497691, + "learning_rate": 0.003, + "loss": 4.1531, + "step": 3737 + }, + { + "epoch": 0.03738, + "grad_norm": 0.4063251593566824, + "learning_rate": 0.003, + "loss": 4.1715, + "step": 3738 + }, + { + "epoch": 0.03739, + "grad_norm": 0.46745772862008544, + "learning_rate": 0.003, + "loss": 4.135, + "step": 3739 + }, + { + "epoch": 0.0374, + "grad_norm": 0.46296090379338223, + "learning_rate": 0.003, + "loss": 4.1535, + "step": 3740 + }, + { + "epoch": 0.03741, + "grad_norm": 0.512997084501291, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 3741 + }, + { + "epoch": 0.03742, + "grad_norm": 0.6523245986952474, + "learning_rate": 0.003, + "loss": 4.1366, + "step": 3742 + }, + { + "epoch": 0.03743, + "grad_norm": 0.7442806374508512, + "learning_rate": 0.003, + "loss": 4.1373, + "step": 3743 + }, + { + "epoch": 0.03744, + "grad_norm": 0.7320455988631113, + "learning_rate": 0.003, + "loss": 4.1716, + "step": 3744 + }, + { + "epoch": 0.03745, + "grad_norm": 0.7624968676067105, + "learning_rate": 0.003, + "loss": 4.1564, + "step": 3745 + }, + { + "epoch": 0.03746, + "grad_norm": 0.7031928628117834, + "learning_rate": 0.003, + "loss": 4.1647, + "step": 3746 + }, + { + "epoch": 0.03747, + "grad_norm": 0.6687793682231787, + "learning_rate": 0.003, + "loss": 4.1679, + "step": 3747 + }, + { + "epoch": 0.03748, + "grad_norm": 0.779694731392804, + "learning_rate": 0.003, + "loss": 4.1488, + "step": 3748 + }, + { + "epoch": 0.03749, + "grad_norm": 0.7516521349267218, + "learning_rate": 0.003, + "loss": 4.1655, + "step": 3749 + }, + { + "epoch": 0.0375, + "grad_norm": 0.7321877471048407, + "learning_rate": 0.003, + "loss": 4.186, + "step": 3750 + }, + { + "epoch": 0.03751, + "grad_norm": 0.7130679243817225, + "learning_rate": 0.003, + "loss": 4.1396, + "step": 3751 + }, + { + "epoch": 0.03752, + "grad_norm": 0.7452672741472625, + "learning_rate": 0.003, + "loss": 4.1674, + "step": 3752 + }, + { + "epoch": 0.03753, + "grad_norm": 0.6724770863912444, + "learning_rate": 0.003, + "loss": 4.1729, + "step": 3753 + }, + { + "epoch": 0.03754, + "grad_norm": 0.556817823672953, + "learning_rate": 0.003, + "loss": 4.1762, + "step": 3754 + }, + { + "epoch": 0.03755, + "grad_norm": 0.5579634308288945, + "learning_rate": 0.003, + "loss": 4.1683, + "step": 3755 + }, + { + "epoch": 0.03756, + "grad_norm": 0.6263580644537361, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 3756 + }, + { + "epoch": 0.03757, + "grad_norm": 0.6400148872312791, + "learning_rate": 0.003, + "loss": 4.176, + "step": 3757 + }, + { + "epoch": 0.03758, + "grad_norm": 0.6369106854442147, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 3758 + }, + { + "epoch": 0.03759, + "grad_norm": 0.5476477808333663, + "learning_rate": 0.003, + "loss": 4.1735, + "step": 3759 + }, + { + "epoch": 0.0376, + "grad_norm": 0.6773838858228983, + "learning_rate": 0.003, + "loss": 4.1634, + "step": 3760 + }, + { + "epoch": 0.03761, + "grad_norm": 0.7718542722389751, + "learning_rate": 0.003, + "loss": 4.1379, + "step": 3761 + }, + { + "epoch": 0.03762, + "grad_norm": 0.6657955446674061, + "learning_rate": 0.003, + "loss": 4.1909, + "step": 3762 + }, + { + "epoch": 0.03763, + "grad_norm": 0.6194620722466222, + "learning_rate": 0.003, + "loss": 4.172, + "step": 3763 + }, + { + "epoch": 0.03764, + "grad_norm": 0.6595957894013659, + "learning_rate": 0.003, + "loss": 4.1858, + "step": 3764 + }, + { + "epoch": 0.03765, + "grad_norm": 0.7419338588721475, + "learning_rate": 0.003, + "loss": 4.2111, + "step": 3765 + }, + { + "epoch": 0.03766, + "grad_norm": 0.7575024078950962, + "learning_rate": 0.003, + "loss": 4.1527, + "step": 3766 + }, + { + "epoch": 0.03767, + "grad_norm": 0.6434009404162437, + "learning_rate": 0.003, + "loss": 4.1896, + "step": 3767 + }, + { + "epoch": 0.03768, + "grad_norm": 0.6051033538599371, + "learning_rate": 0.003, + "loss": 4.1655, + "step": 3768 + }, + { + "epoch": 0.03769, + "grad_norm": 0.581660518608867, + "learning_rate": 0.003, + "loss": 4.1566, + "step": 3769 + }, + { + "epoch": 0.0377, + "grad_norm": 0.47329598157123676, + "learning_rate": 0.003, + "loss": 4.1315, + "step": 3770 + }, + { + "epoch": 0.03771, + "grad_norm": 0.5353355178607329, + "learning_rate": 0.003, + "loss": 4.1525, + "step": 3771 + }, + { + "epoch": 0.03772, + "grad_norm": 0.5081985686041285, + "learning_rate": 0.003, + "loss": 4.1668, + "step": 3772 + }, + { + "epoch": 0.03773, + "grad_norm": 0.4994313050826756, + "learning_rate": 0.003, + "loss": 4.1748, + "step": 3773 + }, + { + "epoch": 0.03774, + "grad_norm": 0.48919774898243473, + "learning_rate": 0.003, + "loss": 4.1567, + "step": 3774 + }, + { + "epoch": 0.03775, + "grad_norm": 0.46915381954128726, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 3775 + }, + { + "epoch": 0.03776, + "grad_norm": 0.46730858925326924, + "learning_rate": 0.003, + "loss": 4.1476, + "step": 3776 + }, + { + "epoch": 0.03777, + "grad_norm": 0.4868019030635906, + "learning_rate": 0.003, + "loss": 4.1632, + "step": 3777 + }, + { + "epoch": 0.03778, + "grad_norm": 0.4826030300886077, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 3778 + }, + { + "epoch": 0.03779, + "grad_norm": 0.4347204008546579, + "learning_rate": 0.003, + "loss": 4.151, + "step": 3779 + }, + { + "epoch": 0.0378, + "grad_norm": 0.4013702620235084, + "learning_rate": 0.003, + "loss": 4.149, + "step": 3780 + }, + { + "epoch": 0.03781, + "grad_norm": 0.4909799595844497, + "learning_rate": 0.003, + "loss": 4.1807, + "step": 3781 + }, + { + "epoch": 0.03782, + "grad_norm": 0.5827570172997248, + "learning_rate": 0.003, + "loss": 4.1562, + "step": 3782 + }, + { + "epoch": 0.03783, + "grad_norm": 0.7057652047796394, + "learning_rate": 0.003, + "loss": 4.1621, + "step": 3783 + }, + { + "epoch": 0.03784, + "grad_norm": 0.7176846202765269, + "learning_rate": 0.003, + "loss": 4.1645, + "step": 3784 + }, + { + "epoch": 0.03785, + "grad_norm": 0.6227082605371008, + "learning_rate": 0.003, + "loss": 4.1465, + "step": 3785 + }, + { + "epoch": 0.03786, + "grad_norm": 0.5851856366466184, + "learning_rate": 0.003, + "loss": 4.1631, + "step": 3786 + }, + { + "epoch": 0.03787, + "grad_norm": 0.5827445998978293, + "learning_rate": 0.003, + "loss": 4.1496, + "step": 3787 + }, + { + "epoch": 0.03788, + "grad_norm": 0.5961875200035882, + "learning_rate": 0.003, + "loss": 4.1832, + "step": 3788 + }, + { + "epoch": 0.03789, + "grad_norm": 0.5599813731691621, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 3789 + }, + { + "epoch": 0.0379, + "grad_norm": 0.5457520494795979, + "learning_rate": 0.003, + "loss": 4.137, + "step": 3790 + }, + { + "epoch": 0.03791, + "grad_norm": 0.5890367342982761, + "learning_rate": 0.003, + "loss": 4.152, + "step": 3791 + }, + { + "epoch": 0.03792, + "grad_norm": 0.6819220498588953, + "learning_rate": 0.003, + "loss": 4.1877, + "step": 3792 + }, + { + "epoch": 0.03793, + "grad_norm": 0.7245933962883803, + "learning_rate": 0.003, + "loss": 4.1838, + "step": 3793 + }, + { + "epoch": 0.03794, + "grad_norm": 0.6330889319321834, + "learning_rate": 0.003, + "loss": 4.1803, + "step": 3794 + }, + { + "epoch": 0.03795, + "grad_norm": 0.6825420551312804, + "learning_rate": 0.003, + "loss": 4.1655, + "step": 3795 + }, + { + "epoch": 0.03796, + "grad_norm": 0.7213382120258517, + "learning_rate": 0.003, + "loss": 4.1496, + "step": 3796 + }, + { + "epoch": 0.03797, + "grad_norm": 0.753053475755318, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 3797 + }, + { + "epoch": 0.03798, + "grad_norm": 0.8024496845778784, + "learning_rate": 0.003, + "loss": 4.163, + "step": 3798 + }, + { + "epoch": 0.03799, + "grad_norm": 0.6761853816469585, + "learning_rate": 0.003, + "loss": 4.1679, + "step": 3799 + }, + { + "epoch": 0.038, + "grad_norm": 0.5760561821833747, + "learning_rate": 0.003, + "loss": 4.1741, + "step": 3800 + }, + { + "epoch": 0.03801, + "grad_norm": 0.5877988869253646, + "learning_rate": 0.003, + "loss": 4.1763, + "step": 3801 + }, + { + "epoch": 0.03802, + "grad_norm": 0.6488463807188865, + "learning_rate": 0.003, + "loss": 4.1519, + "step": 3802 + }, + { + "epoch": 0.03803, + "grad_norm": 0.5517421440712098, + "learning_rate": 0.003, + "loss": 4.177, + "step": 3803 + }, + { + "epoch": 0.03804, + "grad_norm": 0.5096073674154153, + "learning_rate": 0.003, + "loss": 4.1455, + "step": 3804 + }, + { + "epoch": 0.03805, + "grad_norm": 0.5032605295974328, + "learning_rate": 0.003, + "loss": 4.1606, + "step": 3805 + }, + { + "epoch": 0.03806, + "grad_norm": 0.515161268146513, + "learning_rate": 0.003, + "loss": 4.1599, + "step": 3806 + }, + { + "epoch": 0.03807, + "grad_norm": 0.5868281956315563, + "learning_rate": 0.003, + "loss": 4.154, + "step": 3807 + }, + { + "epoch": 0.03808, + "grad_norm": 0.6126977458856417, + "learning_rate": 0.003, + "loss": 4.1693, + "step": 3808 + }, + { + "epoch": 0.03809, + "grad_norm": 0.5958831138264383, + "learning_rate": 0.003, + "loss": 4.1509, + "step": 3809 + }, + { + "epoch": 0.0381, + "grad_norm": 0.5125381608413851, + "learning_rate": 0.003, + "loss": 4.1337, + "step": 3810 + }, + { + "epoch": 0.03811, + "grad_norm": 0.46806634569742234, + "learning_rate": 0.003, + "loss": 4.1638, + "step": 3811 + }, + { + "epoch": 0.03812, + "grad_norm": 0.4708164911957734, + "learning_rate": 0.003, + "loss": 4.1534, + "step": 3812 + }, + { + "epoch": 0.03813, + "grad_norm": 0.4451707961484543, + "learning_rate": 0.003, + "loss": 4.1249, + "step": 3813 + }, + { + "epoch": 0.03814, + "grad_norm": 0.440545036907673, + "learning_rate": 0.003, + "loss": 4.1549, + "step": 3814 + }, + { + "epoch": 0.03815, + "grad_norm": 0.4171348679565834, + "learning_rate": 0.003, + "loss": 4.1429, + "step": 3815 + }, + { + "epoch": 0.03816, + "grad_norm": 0.553176653360458, + "learning_rate": 0.003, + "loss": 4.148, + "step": 3816 + }, + { + "epoch": 0.03817, + "grad_norm": 0.8249856806613923, + "learning_rate": 0.003, + "loss": 4.1371, + "step": 3817 + }, + { + "epoch": 0.03818, + "grad_norm": 1.0849772720701052, + "learning_rate": 0.003, + "loss": 4.2073, + "step": 3818 + }, + { + "epoch": 0.03819, + "grad_norm": 0.8469500316270079, + "learning_rate": 0.003, + "loss": 4.1542, + "step": 3819 + }, + { + "epoch": 0.0382, + "grad_norm": 0.707642322044435, + "learning_rate": 0.003, + "loss": 4.2001, + "step": 3820 + }, + { + "epoch": 0.03821, + "grad_norm": 0.747958147518101, + "learning_rate": 0.003, + "loss": 4.1478, + "step": 3821 + }, + { + "epoch": 0.03822, + "grad_norm": 0.6648920321229015, + "learning_rate": 0.003, + "loss": 4.169, + "step": 3822 + }, + { + "epoch": 0.03823, + "grad_norm": 0.6528306662073633, + "learning_rate": 0.003, + "loss": 4.1635, + "step": 3823 + }, + { + "epoch": 0.03824, + "grad_norm": 0.760735694975771, + "learning_rate": 0.003, + "loss": 4.1651, + "step": 3824 + }, + { + "epoch": 0.03825, + "grad_norm": 0.8622307424344766, + "learning_rate": 0.003, + "loss": 4.185, + "step": 3825 + }, + { + "epoch": 0.03826, + "grad_norm": 0.8866334244520664, + "learning_rate": 0.003, + "loss": 4.1792, + "step": 3826 + }, + { + "epoch": 0.03827, + "grad_norm": 0.8924274619717957, + "learning_rate": 0.003, + "loss": 4.206, + "step": 3827 + }, + { + "epoch": 0.03828, + "grad_norm": 0.7065036502476953, + "learning_rate": 0.003, + "loss": 4.178, + "step": 3828 + }, + { + "epoch": 0.03829, + "grad_norm": 0.6969933866633966, + "learning_rate": 0.003, + "loss": 4.1837, + "step": 3829 + }, + { + "epoch": 0.0383, + "grad_norm": 0.6465184385629209, + "learning_rate": 0.003, + "loss": 4.1679, + "step": 3830 + }, + { + "epoch": 0.03831, + "grad_norm": 0.7285929638974894, + "learning_rate": 0.003, + "loss": 4.1731, + "step": 3831 + }, + { + "epoch": 0.03832, + "grad_norm": 0.7578366971294306, + "learning_rate": 0.003, + "loss": 4.1714, + "step": 3832 + }, + { + "epoch": 0.03833, + "grad_norm": 0.7591256877673351, + "learning_rate": 0.003, + "loss": 4.173, + "step": 3833 + }, + { + "epoch": 0.03834, + "grad_norm": 0.6570705623790875, + "learning_rate": 0.003, + "loss": 4.2013, + "step": 3834 + }, + { + "epoch": 0.03835, + "grad_norm": 0.6338048148231775, + "learning_rate": 0.003, + "loss": 4.1848, + "step": 3835 + }, + { + "epoch": 0.03836, + "grad_norm": 0.579305584931518, + "learning_rate": 0.003, + "loss": 4.1872, + "step": 3836 + }, + { + "epoch": 0.03837, + "grad_norm": 0.5000040915409231, + "learning_rate": 0.003, + "loss": 4.1685, + "step": 3837 + }, + { + "epoch": 0.03838, + "grad_norm": 0.48980765754438066, + "learning_rate": 0.003, + "loss": 4.1528, + "step": 3838 + }, + { + "epoch": 0.03839, + "grad_norm": 0.44391262328684045, + "learning_rate": 0.003, + "loss": 4.1591, + "step": 3839 + }, + { + "epoch": 0.0384, + "grad_norm": 0.4581943030154349, + "learning_rate": 0.003, + "loss": 4.1629, + "step": 3840 + }, + { + "epoch": 0.03841, + "grad_norm": 0.4824433708702651, + "learning_rate": 0.003, + "loss": 4.1634, + "step": 3841 + }, + { + "epoch": 0.03842, + "grad_norm": 0.4830219498091642, + "learning_rate": 0.003, + "loss": 4.1638, + "step": 3842 + }, + { + "epoch": 0.03843, + "grad_norm": 0.45884626845612136, + "learning_rate": 0.003, + "loss": 4.1489, + "step": 3843 + }, + { + "epoch": 0.03844, + "grad_norm": 0.3942504840023139, + "learning_rate": 0.003, + "loss": 4.1542, + "step": 3844 + }, + { + "epoch": 0.03845, + "grad_norm": 0.35673387851966176, + "learning_rate": 0.003, + "loss": 4.1521, + "step": 3845 + }, + { + "epoch": 0.03846, + "grad_norm": 0.3480023206810377, + "learning_rate": 0.003, + "loss": 4.1675, + "step": 3846 + }, + { + "epoch": 0.03847, + "grad_norm": 0.30772952512029117, + "learning_rate": 0.003, + "loss": 4.153, + "step": 3847 + }, + { + "epoch": 0.03848, + "grad_norm": 0.351053429769811, + "learning_rate": 0.003, + "loss": 4.138, + "step": 3848 + }, + { + "epoch": 0.03849, + "grad_norm": 0.3486839774780466, + "learning_rate": 0.003, + "loss": 4.1509, + "step": 3849 + }, + { + "epoch": 0.0385, + "grad_norm": 0.41203805300045565, + "learning_rate": 0.003, + "loss": 4.1624, + "step": 3850 + }, + { + "epoch": 0.03851, + "grad_norm": 0.5052943186225257, + "learning_rate": 0.003, + "loss": 4.1259, + "step": 3851 + }, + { + "epoch": 0.03852, + "grad_norm": 0.6267509920431387, + "learning_rate": 0.003, + "loss": 4.167, + "step": 3852 + }, + { + "epoch": 0.03853, + "grad_norm": 0.7787688899298432, + "learning_rate": 0.003, + "loss": 4.1691, + "step": 3853 + }, + { + "epoch": 0.03854, + "grad_norm": 0.7725109769746114, + "learning_rate": 0.003, + "loss": 4.1654, + "step": 3854 + }, + { + "epoch": 0.03855, + "grad_norm": 0.6052355534312069, + "learning_rate": 0.003, + "loss": 4.1757, + "step": 3855 + }, + { + "epoch": 0.03856, + "grad_norm": 0.6299503142898346, + "learning_rate": 0.003, + "loss": 4.1483, + "step": 3856 + }, + { + "epoch": 0.03857, + "grad_norm": 0.5706246944559047, + "learning_rate": 0.003, + "loss": 4.1532, + "step": 3857 + }, + { + "epoch": 0.03858, + "grad_norm": 0.5178970385116678, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 3858 + }, + { + "epoch": 0.03859, + "grad_norm": 0.6163137266901324, + "learning_rate": 0.003, + "loss": 4.1717, + "step": 3859 + }, + { + "epoch": 0.0386, + "grad_norm": 0.5584392387461609, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 3860 + }, + { + "epoch": 0.03861, + "grad_norm": 0.5881922942664491, + "learning_rate": 0.003, + "loss": 4.174, + "step": 3861 + }, + { + "epoch": 0.03862, + "grad_norm": 0.6070887580362981, + "learning_rate": 0.003, + "loss": 4.1499, + "step": 3862 + }, + { + "epoch": 0.03863, + "grad_norm": 0.5412974380781377, + "learning_rate": 0.003, + "loss": 4.1431, + "step": 3863 + }, + { + "epoch": 0.03864, + "grad_norm": 0.558239316328612, + "learning_rate": 0.003, + "loss": 4.1489, + "step": 3864 + }, + { + "epoch": 0.03865, + "grad_norm": 0.5920559813538258, + "learning_rate": 0.003, + "loss": 4.1625, + "step": 3865 + }, + { + "epoch": 0.03866, + "grad_norm": 0.65782610303018, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 3866 + }, + { + "epoch": 0.03867, + "grad_norm": 0.8001945832132011, + "learning_rate": 0.003, + "loss": 4.1339, + "step": 3867 + }, + { + "epoch": 0.03868, + "grad_norm": 0.8093398470542099, + "learning_rate": 0.003, + "loss": 4.1732, + "step": 3868 + }, + { + "epoch": 0.03869, + "grad_norm": 0.8272905784155873, + "learning_rate": 0.003, + "loss": 4.1631, + "step": 3869 + }, + { + "epoch": 0.0387, + "grad_norm": 0.7805492019719857, + "learning_rate": 0.003, + "loss": 4.1745, + "step": 3870 + }, + { + "epoch": 0.03871, + "grad_norm": 0.6463277445729251, + "learning_rate": 0.003, + "loss": 4.1432, + "step": 3871 + }, + { + "epoch": 0.03872, + "grad_norm": 0.7130625269699555, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 3872 + }, + { + "epoch": 0.03873, + "grad_norm": 0.7604226657571129, + "learning_rate": 0.003, + "loss": 4.1892, + "step": 3873 + }, + { + "epoch": 0.03874, + "grad_norm": 0.8264003715319567, + "learning_rate": 0.003, + "loss": 4.1659, + "step": 3874 + }, + { + "epoch": 0.03875, + "grad_norm": 0.8157133390523014, + "learning_rate": 0.003, + "loss": 4.1706, + "step": 3875 + }, + { + "epoch": 0.03876, + "grad_norm": 0.6633759602344576, + "learning_rate": 0.003, + "loss": 4.1981, + "step": 3876 + }, + { + "epoch": 0.03877, + "grad_norm": 0.5945213411893223, + "learning_rate": 0.003, + "loss": 4.1757, + "step": 3877 + }, + { + "epoch": 0.03878, + "grad_norm": 0.6057373256262768, + "learning_rate": 0.003, + "loss": 4.1882, + "step": 3878 + }, + { + "epoch": 0.03879, + "grad_norm": 0.632481493308064, + "learning_rate": 0.003, + "loss": 4.1887, + "step": 3879 + }, + { + "epoch": 0.0388, + "grad_norm": 0.5936273104909171, + "learning_rate": 0.003, + "loss": 4.1491, + "step": 3880 + }, + { + "epoch": 0.03881, + "grad_norm": 0.5078096495379679, + "learning_rate": 0.003, + "loss": 4.1746, + "step": 3881 + }, + { + "epoch": 0.03882, + "grad_norm": 0.47981556545878357, + "learning_rate": 0.003, + "loss": 4.1795, + "step": 3882 + }, + { + "epoch": 0.03883, + "grad_norm": 0.5081271003225738, + "learning_rate": 0.003, + "loss": 4.1497, + "step": 3883 + }, + { + "epoch": 0.03884, + "grad_norm": 0.5324630916346529, + "learning_rate": 0.003, + "loss": 4.1596, + "step": 3884 + }, + { + "epoch": 0.03885, + "grad_norm": 0.5770047266124201, + "learning_rate": 0.003, + "loss": 4.1786, + "step": 3885 + }, + { + "epoch": 0.03886, + "grad_norm": 0.5823626216641113, + "learning_rate": 0.003, + "loss": 4.1528, + "step": 3886 + }, + { + "epoch": 0.03887, + "grad_norm": 0.5323044500683095, + "learning_rate": 0.003, + "loss": 4.1628, + "step": 3887 + }, + { + "epoch": 0.03888, + "grad_norm": 0.6470253491308557, + "learning_rate": 0.003, + "loss": 4.1434, + "step": 3888 + }, + { + "epoch": 0.03889, + "grad_norm": 0.7016901913145263, + "learning_rate": 0.003, + "loss": 4.1667, + "step": 3889 + }, + { + "epoch": 0.0389, + "grad_norm": 0.6102524549297166, + "learning_rate": 0.003, + "loss": 4.1498, + "step": 3890 + }, + { + "epoch": 0.03891, + "grad_norm": 0.5529440776190121, + "learning_rate": 0.003, + "loss": 4.1557, + "step": 3891 + }, + { + "epoch": 0.03892, + "grad_norm": 0.5916779639985591, + "learning_rate": 0.003, + "loss": 4.172, + "step": 3892 + }, + { + "epoch": 0.03893, + "grad_norm": 0.6828403164964358, + "learning_rate": 0.003, + "loss": 4.1486, + "step": 3893 + }, + { + "epoch": 0.03894, + "grad_norm": 0.6193514242120052, + "learning_rate": 0.003, + "loss": 4.1601, + "step": 3894 + }, + { + "epoch": 0.03895, + "grad_norm": 0.5599659326192736, + "learning_rate": 0.003, + "loss": 4.1314, + "step": 3895 + }, + { + "epoch": 0.03896, + "grad_norm": 0.48762045043395263, + "learning_rate": 0.003, + "loss": 4.1557, + "step": 3896 + }, + { + "epoch": 0.03897, + "grad_norm": 0.5042861977924956, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 3897 + }, + { + "epoch": 0.03898, + "grad_norm": 0.487281141563187, + "learning_rate": 0.003, + "loss": 4.1596, + "step": 3898 + }, + { + "epoch": 0.03899, + "grad_norm": 0.5187455846326631, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 3899 + }, + { + "epoch": 0.039, + "grad_norm": 0.4540397798848912, + "learning_rate": 0.003, + "loss": 4.1505, + "step": 3900 + }, + { + "epoch": 0.03901, + "grad_norm": 0.4735936186553674, + "learning_rate": 0.003, + "loss": 4.1619, + "step": 3901 + }, + { + "epoch": 0.03902, + "grad_norm": 0.45921775589104796, + "learning_rate": 0.003, + "loss": 4.1329, + "step": 3902 + }, + { + "epoch": 0.03903, + "grad_norm": 0.37493975839614513, + "learning_rate": 0.003, + "loss": 4.168, + "step": 3903 + }, + { + "epoch": 0.03904, + "grad_norm": 0.3462652687764147, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 3904 + }, + { + "epoch": 0.03905, + "grad_norm": 0.35156300254567974, + "learning_rate": 0.003, + "loss": 4.1648, + "step": 3905 + }, + { + "epoch": 0.03906, + "grad_norm": 0.3615192109464765, + "learning_rate": 0.003, + "loss": 4.1355, + "step": 3906 + }, + { + "epoch": 0.03907, + "grad_norm": 0.4018823550496774, + "learning_rate": 0.003, + "loss": 4.1489, + "step": 3907 + }, + { + "epoch": 0.03908, + "grad_norm": 0.5764826701055724, + "learning_rate": 0.003, + "loss": 4.1338, + "step": 3908 + }, + { + "epoch": 0.03909, + "grad_norm": 0.7886220228031054, + "learning_rate": 0.003, + "loss": 4.1504, + "step": 3909 + }, + { + "epoch": 0.0391, + "grad_norm": 0.8778590965436065, + "learning_rate": 0.003, + "loss": 4.1885, + "step": 3910 + }, + { + "epoch": 0.03911, + "grad_norm": 0.7707501748681591, + "learning_rate": 0.003, + "loss": 4.1709, + "step": 3911 + }, + { + "epoch": 0.03912, + "grad_norm": 0.7237965058473473, + "learning_rate": 0.003, + "loss": 4.1699, + "step": 3912 + }, + { + "epoch": 0.03913, + "grad_norm": 0.7684494783386372, + "learning_rate": 0.003, + "loss": 4.1451, + "step": 3913 + }, + { + "epoch": 0.03914, + "grad_norm": 0.6559500648333249, + "learning_rate": 0.003, + "loss": 4.1776, + "step": 3914 + }, + { + "epoch": 0.03915, + "grad_norm": 0.6438695734989948, + "learning_rate": 0.003, + "loss": 4.149, + "step": 3915 + }, + { + "epoch": 0.03916, + "grad_norm": 0.710001018742664, + "learning_rate": 0.003, + "loss": 4.1781, + "step": 3916 + }, + { + "epoch": 0.03917, + "grad_norm": 0.7219014702450186, + "learning_rate": 0.003, + "loss": 4.1693, + "step": 3917 + }, + { + "epoch": 0.03918, + "grad_norm": 0.6133683335088463, + "learning_rate": 0.003, + "loss": 4.156, + "step": 3918 + }, + { + "epoch": 0.03919, + "grad_norm": 0.4840575228811507, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 3919 + }, + { + "epoch": 0.0392, + "grad_norm": 0.4695659280359604, + "learning_rate": 0.003, + "loss": 4.1433, + "step": 3920 + }, + { + "epoch": 0.03921, + "grad_norm": 0.4597028009194431, + "learning_rate": 0.003, + "loss": 4.1369, + "step": 3921 + }, + { + "epoch": 0.03922, + "grad_norm": 0.4021059704956593, + "learning_rate": 0.003, + "loss": 4.1754, + "step": 3922 + }, + { + "epoch": 0.03923, + "grad_norm": 0.45811748316459755, + "learning_rate": 0.003, + "loss": 4.1299, + "step": 3923 + }, + { + "epoch": 0.03924, + "grad_norm": 0.4786080372228159, + "learning_rate": 0.003, + "loss": 4.1388, + "step": 3924 + }, + { + "epoch": 0.03925, + "grad_norm": 0.5049165752143754, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 3925 + }, + { + "epoch": 0.03926, + "grad_norm": 0.5163104662428779, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 3926 + }, + { + "epoch": 0.03927, + "grad_norm": 0.5099171230784774, + "learning_rate": 0.003, + "loss": 4.1492, + "step": 3927 + }, + { + "epoch": 0.03928, + "grad_norm": 0.572288419562524, + "learning_rate": 0.003, + "loss": 4.168, + "step": 3928 + }, + { + "epoch": 0.03929, + "grad_norm": 0.6901560668624227, + "learning_rate": 0.003, + "loss": 4.1812, + "step": 3929 + }, + { + "epoch": 0.0393, + "grad_norm": 0.6037962837223242, + "learning_rate": 0.003, + "loss": 4.1643, + "step": 3930 + }, + { + "epoch": 0.03931, + "grad_norm": 0.5251957250436882, + "learning_rate": 0.003, + "loss": 4.1638, + "step": 3931 + }, + { + "epoch": 0.03932, + "grad_norm": 0.538831165664605, + "learning_rate": 0.003, + "loss": 4.1652, + "step": 3932 + }, + { + "epoch": 0.03933, + "grad_norm": 0.5209838890496463, + "learning_rate": 0.003, + "loss": 4.1515, + "step": 3933 + }, + { + "epoch": 0.03934, + "grad_norm": 0.566519679434704, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 3934 + }, + { + "epoch": 0.03935, + "grad_norm": 0.646396237695394, + "learning_rate": 0.003, + "loss": 4.1458, + "step": 3935 + }, + { + "epoch": 0.03936, + "grad_norm": 0.7750930535443175, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 3936 + }, + { + "epoch": 0.03937, + "grad_norm": 0.9869397924658896, + "learning_rate": 0.003, + "loss": 4.168, + "step": 3937 + }, + { + "epoch": 0.03938, + "grad_norm": 1.0277905519905985, + "learning_rate": 0.003, + "loss": 4.1836, + "step": 3938 + }, + { + "epoch": 0.03939, + "grad_norm": 0.7852266987902085, + "learning_rate": 0.003, + "loss": 4.1521, + "step": 3939 + }, + { + "epoch": 0.0394, + "grad_norm": 0.8232677591358063, + "learning_rate": 0.003, + "loss": 4.1891, + "step": 3940 + }, + { + "epoch": 0.03941, + "grad_norm": 0.7554935915430424, + "learning_rate": 0.003, + "loss": 4.1765, + "step": 3941 + }, + { + "epoch": 0.03942, + "grad_norm": 0.5714665309206746, + "learning_rate": 0.003, + "loss": 4.1794, + "step": 3942 + }, + { + "epoch": 0.03943, + "grad_norm": 0.5845658325896066, + "learning_rate": 0.003, + "loss": 4.1718, + "step": 3943 + }, + { + "epoch": 0.03944, + "grad_norm": 0.6026107955836698, + "learning_rate": 0.003, + "loss": 4.1867, + "step": 3944 + }, + { + "epoch": 0.03945, + "grad_norm": 0.5468006864617043, + "learning_rate": 0.003, + "loss": 4.1546, + "step": 3945 + }, + { + "epoch": 0.03946, + "grad_norm": 0.5298095903489612, + "learning_rate": 0.003, + "loss": 4.1779, + "step": 3946 + }, + { + "epoch": 0.03947, + "grad_norm": 0.5118480688773221, + "learning_rate": 0.003, + "loss": 4.1795, + "step": 3947 + }, + { + "epoch": 0.03948, + "grad_norm": 0.5415983011837967, + "learning_rate": 0.003, + "loss": 4.1971, + "step": 3948 + }, + { + "epoch": 0.03949, + "grad_norm": 0.6476511273829448, + "learning_rate": 0.003, + "loss": 4.1406, + "step": 3949 + }, + { + "epoch": 0.0395, + "grad_norm": 0.5921282759053738, + "learning_rate": 0.003, + "loss": 4.1694, + "step": 3950 + }, + { + "epoch": 0.03951, + "grad_norm": 0.4873072132844621, + "learning_rate": 0.003, + "loss": 4.1943, + "step": 3951 + }, + { + "epoch": 0.03952, + "grad_norm": 0.47680587697404614, + "learning_rate": 0.003, + "loss": 4.1735, + "step": 3952 + }, + { + "epoch": 0.03953, + "grad_norm": 0.4863348841149029, + "learning_rate": 0.003, + "loss": 4.1495, + "step": 3953 + }, + { + "epoch": 0.03954, + "grad_norm": 0.4976353402048533, + "learning_rate": 0.003, + "loss": 4.1532, + "step": 3954 + }, + { + "epoch": 0.03955, + "grad_norm": 0.5951063481718434, + "learning_rate": 0.003, + "loss": 4.1575, + "step": 3955 + }, + { + "epoch": 0.03956, + "grad_norm": 0.6359841084362824, + "learning_rate": 0.003, + "loss": 4.1616, + "step": 3956 + }, + { + "epoch": 0.03957, + "grad_norm": 0.5941354341152176, + "learning_rate": 0.003, + "loss": 4.1652, + "step": 3957 + }, + { + "epoch": 0.03958, + "grad_norm": 0.6262454009808719, + "learning_rate": 0.003, + "loss": 4.1397, + "step": 3958 + }, + { + "epoch": 0.03959, + "grad_norm": 0.7154633677223874, + "learning_rate": 0.003, + "loss": 4.1455, + "step": 3959 + }, + { + "epoch": 0.0396, + "grad_norm": 0.7081920422281349, + "learning_rate": 0.003, + "loss": 4.165, + "step": 3960 + }, + { + "epoch": 0.03961, + "grad_norm": 0.7397573168118693, + "learning_rate": 0.003, + "loss": 4.1626, + "step": 3961 + }, + { + "epoch": 0.03962, + "grad_norm": 0.711337260874705, + "learning_rate": 0.003, + "loss": 4.1737, + "step": 3962 + }, + { + "epoch": 0.03963, + "grad_norm": 0.7123884385146033, + "learning_rate": 0.003, + "loss": 4.1626, + "step": 3963 + }, + { + "epoch": 0.03964, + "grad_norm": 0.6152211153875224, + "learning_rate": 0.003, + "loss": 4.1597, + "step": 3964 + }, + { + "epoch": 0.03965, + "grad_norm": 0.591222143630805, + "learning_rate": 0.003, + "loss": 4.1579, + "step": 3965 + }, + { + "epoch": 0.03966, + "grad_norm": 0.5392802901033877, + "learning_rate": 0.003, + "loss": 4.1935, + "step": 3966 + }, + { + "epoch": 0.03967, + "grad_norm": 0.524610568064759, + "learning_rate": 0.003, + "loss": 4.1481, + "step": 3967 + }, + { + "epoch": 0.03968, + "grad_norm": 0.5121602181555871, + "learning_rate": 0.003, + "loss": 4.1754, + "step": 3968 + }, + { + "epoch": 0.03969, + "grad_norm": 0.5008822737603466, + "learning_rate": 0.003, + "loss": 4.1543, + "step": 3969 + }, + { + "epoch": 0.0397, + "grad_norm": 0.5150356493385037, + "learning_rate": 0.003, + "loss": 4.1583, + "step": 3970 + }, + { + "epoch": 0.03971, + "grad_norm": 0.5181126377410241, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 3971 + }, + { + "epoch": 0.03972, + "grad_norm": 0.4952868090920291, + "learning_rate": 0.003, + "loss": 4.1549, + "step": 3972 + }, + { + "epoch": 0.03973, + "grad_norm": 0.49146340749665485, + "learning_rate": 0.003, + "loss": 4.1581, + "step": 3973 + }, + { + "epoch": 0.03974, + "grad_norm": 0.6037509933252784, + "learning_rate": 0.003, + "loss": 4.1574, + "step": 3974 + }, + { + "epoch": 0.03975, + "grad_norm": 0.7194350601933044, + "learning_rate": 0.003, + "loss": 4.152, + "step": 3975 + }, + { + "epoch": 0.03976, + "grad_norm": 0.7807821793201645, + "learning_rate": 0.003, + "loss": 4.189, + "step": 3976 + }, + { + "epoch": 0.03977, + "grad_norm": 0.7824814786642659, + "learning_rate": 0.003, + "loss": 4.18, + "step": 3977 + }, + { + "epoch": 0.03978, + "grad_norm": 0.7382196107400837, + "learning_rate": 0.003, + "loss": 4.145, + "step": 3978 + }, + { + "epoch": 0.03979, + "grad_norm": 0.7187722022394311, + "learning_rate": 0.003, + "loss": 4.1683, + "step": 3979 + }, + { + "epoch": 0.0398, + "grad_norm": 0.7054934205273465, + "learning_rate": 0.003, + "loss": 4.1617, + "step": 3980 + }, + { + "epoch": 0.03981, + "grad_norm": 0.7032054871032929, + "learning_rate": 0.003, + "loss": 4.1547, + "step": 3981 + }, + { + "epoch": 0.03982, + "grad_norm": 0.5823608521179618, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 3982 + }, + { + "epoch": 0.03983, + "grad_norm": 0.5786298940783224, + "learning_rate": 0.003, + "loss": 4.1554, + "step": 3983 + }, + { + "epoch": 0.03984, + "grad_norm": 0.6246079403898089, + "learning_rate": 0.003, + "loss": 4.166, + "step": 3984 + }, + { + "epoch": 0.03985, + "grad_norm": 0.6818628523024235, + "learning_rate": 0.003, + "loss": 4.1623, + "step": 3985 + }, + { + "epoch": 0.03986, + "grad_norm": 0.6535731478840459, + "learning_rate": 0.003, + "loss": 4.1629, + "step": 3986 + }, + { + "epoch": 0.03987, + "grad_norm": 0.6040151659435402, + "learning_rate": 0.003, + "loss": 4.1639, + "step": 3987 + }, + { + "epoch": 0.03988, + "grad_norm": 0.6453741519043461, + "learning_rate": 0.003, + "loss": 4.1707, + "step": 3988 + }, + { + "epoch": 0.03989, + "grad_norm": 0.764312719470282, + "learning_rate": 0.003, + "loss": 4.1601, + "step": 3989 + }, + { + "epoch": 0.0399, + "grad_norm": 0.8495293415117621, + "learning_rate": 0.003, + "loss": 4.1394, + "step": 3990 + }, + { + "epoch": 0.03991, + "grad_norm": 0.7402933337405657, + "learning_rate": 0.003, + "loss": 4.183, + "step": 3991 + }, + { + "epoch": 0.03992, + "grad_norm": 0.5804226786236383, + "learning_rate": 0.003, + "loss": 4.1586, + "step": 3992 + }, + { + "epoch": 0.03993, + "grad_norm": 0.586169093303552, + "learning_rate": 0.003, + "loss": 4.1558, + "step": 3993 + }, + { + "epoch": 0.03994, + "grad_norm": 0.6041046229351503, + "learning_rate": 0.003, + "loss": 4.1516, + "step": 3994 + }, + { + "epoch": 0.03995, + "grad_norm": 0.6875437069134418, + "learning_rate": 0.003, + "loss": 4.1441, + "step": 3995 + }, + { + "epoch": 0.03996, + "grad_norm": 0.5847418717553203, + "learning_rate": 0.003, + "loss": 4.1722, + "step": 3996 + }, + { + "epoch": 0.03997, + "grad_norm": 0.5323079012210317, + "learning_rate": 0.003, + "loss": 4.1722, + "step": 3997 + }, + { + "epoch": 0.03998, + "grad_norm": 0.4356783084468044, + "learning_rate": 0.003, + "loss": 4.1713, + "step": 3998 + }, + { + "epoch": 0.03999, + "grad_norm": 0.3763086485452268, + "learning_rate": 0.003, + "loss": 4.1692, + "step": 3999 + }, + { + "epoch": 0.04, + "grad_norm": 0.37795109500013657, + "learning_rate": 0.003, + "loss": 4.1715, + "step": 4000 + }, + { + "epoch": 0.04001, + "grad_norm": 0.3830959820048188, + "learning_rate": 0.003, + "loss": 4.167, + "step": 4001 + }, + { + "epoch": 0.04002, + "grad_norm": 0.41146487178002505, + "learning_rate": 0.003, + "loss": 4.1388, + "step": 4002 + }, + { + "epoch": 0.04003, + "grad_norm": 0.4313910001981467, + "learning_rate": 0.003, + "loss": 4.1612, + "step": 4003 + }, + { + "epoch": 0.04004, + "grad_norm": 0.4459746247604169, + "learning_rate": 0.003, + "loss": 4.1664, + "step": 4004 + }, + { + "epoch": 0.04005, + "grad_norm": 0.47718734644902294, + "learning_rate": 0.003, + "loss": 4.1732, + "step": 4005 + }, + { + "epoch": 0.04006, + "grad_norm": 0.4814752963605302, + "learning_rate": 0.003, + "loss": 4.1303, + "step": 4006 + }, + { + "epoch": 0.04007, + "grad_norm": 0.5099778221457298, + "learning_rate": 0.003, + "loss": 4.1582, + "step": 4007 + }, + { + "epoch": 0.04008, + "grad_norm": 0.5043675363902501, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 4008 + }, + { + "epoch": 0.04009, + "grad_norm": 0.5439841205133068, + "learning_rate": 0.003, + "loss": 4.1338, + "step": 4009 + }, + { + "epoch": 0.0401, + "grad_norm": 0.6497881885617821, + "learning_rate": 0.003, + "loss": 4.155, + "step": 4010 + }, + { + "epoch": 0.04011, + "grad_norm": 0.8245370470535895, + "learning_rate": 0.003, + "loss": 4.1694, + "step": 4011 + }, + { + "epoch": 0.04012, + "grad_norm": 0.9674009929255976, + "learning_rate": 0.003, + "loss": 4.1789, + "step": 4012 + }, + { + "epoch": 0.04013, + "grad_norm": 0.7353158349829942, + "learning_rate": 0.003, + "loss": 4.1699, + "step": 4013 + }, + { + "epoch": 0.04014, + "grad_norm": 0.7081307887263109, + "learning_rate": 0.003, + "loss": 4.1588, + "step": 4014 + }, + { + "epoch": 0.04015, + "grad_norm": 0.7515395751430206, + "learning_rate": 0.003, + "loss": 4.1398, + "step": 4015 + }, + { + "epoch": 0.04016, + "grad_norm": 0.7357741460180915, + "learning_rate": 0.003, + "loss": 4.1563, + "step": 4016 + }, + { + "epoch": 0.04017, + "grad_norm": 0.6881306027184227, + "learning_rate": 0.003, + "loss": 4.2004, + "step": 4017 + }, + { + "epoch": 0.04018, + "grad_norm": 0.6328656616117202, + "learning_rate": 0.003, + "loss": 4.1663, + "step": 4018 + }, + { + "epoch": 0.04019, + "grad_norm": 0.672111331766413, + "learning_rate": 0.003, + "loss": 4.147, + "step": 4019 + }, + { + "epoch": 0.0402, + "grad_norm": 0.6935269300068305, + "learning_rate": 0.003, + "loss": 4.1825, + "step": 4020 + }, + { + "epoch": 0.04021, + "grad_norm": 0.6368533158642944, + "learning_rate": 0.003, + "loss": 4.1532, + "step": 4021 + }, + { + "epoch": 0.04022, + "grad_norm": 0.7051716536344202, + "learning_rate": 0.003, + "loss": 4.1558, + "step": 4022 + }, + { + "epoch": 0.04023, + "grad_norm": 0.6469926590464656, + "learning_rate": 0.003, + "loss": 4.1549, + "step": 4023 + }, + { + "epoch": 0.04024, + "grad_norm": 0.6694771111384129, + "learning_rate": 0.003, + "loss": 4.1478, + "step": 4024 + }, + { + "epoch": 0.04025, + "grad_norm": 0.6341741755537447, + "learning_rate": 0.003, + "loss": 4.1513, + "step": 4025 + }, + { + "epoch": 0.04026, + "grad_norm": 0.6640885632809859, + "learning_rate": 0.003, + "loss": 4.1767, + "step": 4026 + }, + { + "epoch": 0.04027, + "grad_norm": 0.6089194432075719, + "learning_rate": 0.003, + "loss": 4.1801, + "step": 4027 + }, + { + "epoch": 0.04028, + "grad_norm": 0.6436111853565991, + "learning_rate": 0.003, + "loss": 4.1589, + "step": 4028 + }, + { + "epoch": 0.04029, + "grad_norm": 0.601196906060423, + "learning_rate": 0.003, + "loss": 4.1851, + "step": 4029 + }, + { + "epoch": 0.0403, + "grad_norm": 0.6008464331523574, + "learning_rate": 0.003, + "loss": 4.1371, + "step": 4030 + }, + { + "epoch": 0.04031, + "grad_norm": 0.5735266100756939, + "learning_rate": 0.003, + "loss": 4.1527, + "step": 4031 + }, + { + "epoch": 0.04032, + "grad_norm": 0.5248413725968446, + "learning_rate": 0.003, + "loss": 4.1768, + "step": 4032 + }, + { + "epoch": 0.04033, + "grad_norm": 0.49046370370450504, + "learning_rate": 0.003, + "loss": 4.1695, + "step": 4033 + }, + { + "epoch": 0.04034, + "grad_norm": 0.5090112067595912, + "learning_rate": 0.003, + "loss": 4.136, + "step": 4034 + }, + { + "epoch": 0.04035, + "grad_norm": 0.4649874752654264, + "learning_rate": 0.003, + "loss": 4.1578, + "step": 4035 + }, + { + "epoch": 0.04036, + "grad_norm": 0.4897958199323308, + "learning_rate": 0.003, + "loss": 4.1724, + "step": 4036 + }, + { + "epoch": 0.04037, + "grad_norm": 0.6044670370792896, + "learning_rate": 0.003, + "loss": 4.1234, + "step": 4037 + }, + { + "epoch": 0.04038, + "grad_norm": 0.8335470507296103, + "learning_rate": 0.003, + "loss": 4.1519, + "step": 4038 + }, + { + "epoch": 0.04039, + "grad_norm": 0.8645850468976461, + "learning_rate": 0.003, + "loss": 4.1591, + "step": 4039 + }, + { + "epoch": 0.0404, + "grad_norm": 0.7530842619650773, + "learning_rate": 0.003, + "loss": 4.1525, + "step": 4040 + }, + { + "epoch": 0.04041, + "grad_norm": 0.5842132443480766, + "learning_rate": 0.003, + "loss": 4.1457, + "step": 4041 + }, + { + "epoch": 0.04042, + "grad_norm": 0.5354331761654865, + "learning_rate": 0.003, + "loss": 4.1835, + "step": 4042 + }, + { + "epoch": 0.04043, + "grad_norm": 0.5185871828066747, + "learning_rate": 0.003, + "loss": 4.1689, + "step": 4043 + }, + { + "epoch": 0.04044, + "grad_norm": 0.5097132412370223, + "learning_rate": 0.003, + "loss": 4.162, + "step": 4044 + }, + { + "epoch": 0.04045, + "grad_norm": 0.553775594849916, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 4045 + }, + { + "epoch": 0.04046, + "grad_norm": 0.49141462695183785, + "learning_rate": 0.003, + "loss": 4.1533, + "step": 4046 + }, + { + "epoch": 0.04047, + "grad_norm": 0.41512219304607756, + "learning_rate": 0.003, + "loss": 4.1427, + "step": 4047 + }, + { + "epoch": 0.04048, + "grad_norm": 0.4225143586443279, + "learning_rate": 0.003, + "loss": 4.1344, + "step": 4048 + }, + { + "epoch": 0.04049, + "grad_norm": 0.3984988970584974, + "learning_rate": 0.003, + "loss": 4.1735, + "step": 4049 + }, + { + "epoch": 0.0405, + "grad_norm": 0.3749419268913753, + "learning_rate": 0.003, + "loss": 4.1684, + "step": 4050 + }, + { + "epoch": 0.04051, + "grad_norm": 0.39476946001794994, + "learning_rate": 0.003, + "loss": 4.147, + "step": 4051 + }, + { + "epoch": 0.04052, + "grad_norm": 0.4554884596116926, + "learning_rate": 0.003, + "loss": 4.1554, + "step": 4052 + }, + { + "epoch": 0.04053, + "grad_norm": 0.6242383494718413, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 4053 + }, + { + "epoch": 0.04054, + "grad_norm": 0.7860677992363685, + "learning_rate": 0.003, + "loss": 4.1382, + "step": 4054 + }, + { + "epoch": 0.04055, + "grad_norm": 0.8126593846409768, + "learning_rate": 0.003, + "loss": 4.137, + "step": 4055 + }, + { + "epoch": 0.04056, + "grad_norm": 0.7391710444931487, + "learning_rate": 0.003, + "loss": 4.1597, + "step": 4056 + }, + { + "epoch": 0.04057, + "grad_norm": 0.7425519640955306, + "learning_rate": 0.003, + "loss": 4.1623, + "step": 4057 + }, + { + "epoch": 0.04058, + "grad_norm": 0.7750101755361524, + "learning_rate": 0.003, + "loss": 4.1387, + "step": 4058 + }, + { + "epoch": 0.04059, + "grad_norm": 0.6970458468533857, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 4059 + }, + { + "epoch": 0.0406, + "grad_norm": 0.6081117396919562, + "learning_rate": 0.003, + "loss": 4.1775, + "step": 4060 + }, + { + "epoch": 0.04061, + "grad_norm": 0.539360737374925, + "learning_rate": 0.003, + "loss": 4.1598, + "step": 4061 + }, + { + "epoch": 0.04062, + "grad_norm": 0.5166363975835321, + "learning_rate": 0.003, + "loss": 4.1383, + "step": 4062 + }, + { + "epoch": 0.04063, + "grad_norm": 0.49611935522557493, + "learning_rate": 0.003, + "loss": 4.1435, + "step": 4063 + }, + { + "epoch": 0.04064, + "grad_norm": 0.41285257049195684, + "learning_rate": 0.003, + "loss": 4.1525, + "step": 4064 + }, + { + "epoch": 0.04065, + "grad_norm": 0.4763881231604338, + "learning_rate": 0.003, + "loss": 4.193, + "step": 4065 + }, + { + "epoch": 0.04066, + "grad_norm": 0.4741067038687965, + "learning_rate": 0.003, + "loss": 4.15, + "step": 4066 + }, + { + "epoch": 0.04067, + "grad_norm": 0.4426062959642298, + "learning_rate": 0.003, + "loss": 4.1514, + "step": 4067 + }, + { + "epoch": 0.04068, + "grad_norm": 0.42915467353070186, + "learning_rate": 0.003, + "loss": 4.1582, + "step": 4068 + }, + { + "epoch": 0.04069, + "grad_norm": 0.45412191308027944, + "learning_rate": 0.003, + "loss": 4.1487, + "step": 4069 + }, + { + "epoch": 0.0407, + "grad_norm": 0.4671086228860357, + "learning_rate": 0.003, + "loss": 4.1509, + "step": 4070 + }, + { + "epoch": 0.04071, + "grad_norm": 0.5692230139112596, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 4071 + }, + { + "epoch": 0.04072, + "grad_norm": 0.7764468905836547, + "learning_rate": 0.003, + "loss": 4.1518, + "step": 4072 + }, + { + "epoch": 0.04073, + "grad_norm": 0.8448322221505851, + "learning_rate": 0.003, + "loss": 4.1421, + "step": 4073 + }, + { + "epoch": 0.04074, + "grad_norm": 0.856020875384491, + "learning_rate": 0.003, + "loss": 4.1745, + "step": 4074 + }, + { + "epoch": 0.04075, + "grad_norm": 0.9287167181671592, + "learning_rate": 0.003, + "loss": 4.1779, + "step": 4075 + }, + { + "epoch": 0.04076, + "grad_norm": 0.873329112405605, + "learning_rate": 0.003, + "loss": 4.1789, + "step": 4076 + }, + { + "epoch": 0.04077, + "grad_norm": 0.7601580397215363, + "learning_rate": 0.003, + "loss": 4.1881, + "step": 4077 + }, + { + "epoch": 0.04078, + "grad_norm": 0.8042586607784782, + "learning_rate": 0.003, + "loss": 4.1696, + "step": 4078 + }, + { + "epoch": 0.04079, + "grad_norm": 0.8176950474454038, + "learning_rate": 0.003, + "loss": 4.1771, + "step": 4079 + }, + { + "epoch": 0.0408, + "grad_norm": 0.9030758743289882, + "learning_rate": 0.003, + "loss": 4.1811, + "step": 4080 + }, + { + "epoch": 0.04081, + "grad_norm": 0.7986927315562776, + "learning_rate": 0.003, + "loss": 4.1573, + "step": 4081 + }, + { + "epoch": 0.04082, + "grad_norm": 0.7456348575122663, + "learning_rate": 0.003, + "loss": 4.1933, + "step": 4082 + }, + { + "epoch": 0.04083, + "grad_norm": 0.7464945818151806, + "learning_rate": 0.003, + "loss": 4.1693, + "step": 4083 + }, + { + "epoch": 0.04084, + "grad_norm": 0.5910864081536713, + "learning_rate": 0.003, + "loss": 4.2007, + "step": 4084 + }, + { + "epoch": 0.04085, + "grad_norm": 0.6374107110244114, + "learning_rate": 0.003, + "loss": 4.1813, + "step": 4085 + }, + { + "epoch": 0.04086, + "grad_norm": 0.644544973240151, + "learning_rate": 0.003, + "loss": 4.2032, + "step": 4086 + }, + { + "epoch": 0.04087, + "grad_norm": 0.6474810812977669, + "learning_rate": 0.003, + "loss": 4.1827, + "step": 4087 + }, + { + "epoch": 0.04088, + "grad_norm": 0.48757799745230485, + "learning_rate": 0.003, + "loss": 4.1563, + "step": 4088 + }, + { + "epoch": 0.04089, + "grad_norm": 0.5082604112884798, + "learning_rate": 0.003, + "loss": 4.1548, + "step": 4089 + }, + { + "epoch": 0.0409, + "grad_norm": 0.4251634976124169, + "learning_rate": 0.003, + "loss": 4.1671, + "step": 4090 + }, + { + "epoch": 0.04091, + "grad_norm": 0.40607507200162474, + "learning_rate": 0.003, + "loss": 4.1758, + "step": 4091 + }, + { + "epoch": 0.04092, + "grad_norm": 0.4118371741461281, + "learning_rate": 0.003, + "loss": 4.1314, + "step": 4092 + }, + { + "epoch": 0.04093, + "grad_norm": 0.379924610879027, + "learning_rate": 0.003, + "loss": 4.1538, + "step": 4093 + }, + { + "epoch": 0.04094, + "grad_norm": 0.43768923530899545, + "learning_rate": 0.003, + "loss": 4.1461, + "step": 4094 + }, + { + "epoch": 0.04095, + "grad_norm": 0.411023776294053, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 4095 + }, + { + "epoch": 0.04096, + "grad_norm": 0.45768098717184347, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 4096 + }, + { + "epoch": 0.04097, + "grad_norm": 0.557314037287976, + "learning_rate": 0.003, + "loss": 4.1461, + "step": 4097 + }, + { + "epoch": 0.04098, + "grad_norm": 0.6699585119473419, + "learning_rate": 0.003, + "loss": 4.1559, + "step": 4098 + }, + { + "epoch": 0.04099, + "grad_norm": 0.6624628981660053, + "learning_rate": 0.003, + "loss": 4.1765, + "step": 4099 + }, + { + "epoch": 0.041, + "grad_norm": 0.4938757392791384, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 4100 + }, + { + "epoch": 0.04101, + "grad_norm": 0.41979137569186115, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 4101 + }, + { + "epoch": 0.04102, + "grad_norm": 0.49582634443681156, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 4102 + }, + { + "epoch": 0.04103, + "grad_norm": 0.46586436339945675, + "learning_rate": 0.003, + "loss": 4.166, + "step": 4103 + }, + { + "epoch": 0.04104, + "grad_norm": 0.45341411443238716, + "learning_rate": 0.003, + "loss": 4.1493, + "step": 4104 + }, + { + "epoch": 0.04105, + "grad_norm": 0.39192920748405663, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 4105 + }, + { + "epoch": 0.04106, + "grad_norm": 0.40403661174930955, + "learning_rate": 0.003, + "loss": 4.1493, + "step": 4106 + }, + { + "epoch": 0.04107, + "grad_norm": 0.46259866064643457, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 4107 + }, + { + "epoch": 0.04108, + "grad_norm": 0.5224958035249453, + "learning_rate": 0.003, + "loss": 4.1331, + "step": 4108 + }, + { + "epoch": 0.04109, + "grad_norm": 0.5925525541895633, + "learning_rate": 0.003, + "loss": 4.1254, + "step": 4109 + }, + { + "epoch": 0.0411, + "grad_norm": 0.6870748923239856, + "learning_rate": 0.003, + "loss": 4.142, + "step": 4110 + }, + { + "epoch": 0.04111, + "grad_norm": 0.6433679126097386, + "learning_rate": 0.003, + "loss": 4.1265, + "step": 4111 + }, + { + "epoch": 0.04112, + "grad_norm": 0.5391529554622597, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 4112 + }, + { + "epoch": 0.04113, + "grad_norm": 0.5453895793045661, + "learning_rate": 0.003, + "loss": 4.1858, + "step": 4113 + }, + { + "epoch": 0.04114, + "grad_norm": 0.6398919299706648, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 4114 + }, + { + "epoch": 0.04115, + "grad_norm": 0.5856750517364259, + "learning_rate": 0.003, + "loss": 4.1371, + "step": 4115 + }, + { + "epoch": 0.04116, + "grad_norm": 0.5640052575174052, + "learning_rate": 0.003, + "loss": 4.1564, + "step": 4116 + }, + { + "epoch": 0.04117, + "grad_norm": 0.5578995001301751, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 4117 + }, + { + "epoch": 0.04118, + "grad_norm": 0.6189606161266092, + "learning_rate": 0.003, + "loss": 4.1802, + "step": 4118 + }, + { + "epoch": 0.04119, + "grad_norm": 0.69284741853494, + "learning_rate": 0.003, + "loss": 4.1574, + "step": 4119 + }, + { + "epoch": 0.0412, + "grad_norm": 0.8122773604588729, + "learning_rate": 0.003, + "loss": 4.1568, + "step": 4120 + }, + { + "epoch": 0.04121, + "grad_norm": 0.7788291402367661, + "learning_rate": 0.003, + "loss": 4.1805, + "step": 4121 + }, + { + "epoch": 0.04122, + "grad_norm": 0.7205856618274178, + "learning_rate": 0.003, + "loss": 4.1543, + "step": 4122 + }, + { + "epoch": 0.04123, + "grad_norm": 0.6723108027243482, + "learning_rate": 0.003, + "loss": 4.1631, + "step": 4123 + }, + { + "epoch": 0.04124, + "grad_norm": 0.5742707663821022, + "learning_rate": 0.003, + "loss": 4.138, + "step": 4124 + }, + { + "epoch": 0.04125, + "grad_norm": 0.5921662326911094, + "learning_rate": 0.003, + "loss": 4.1635, + "step": 4125 + }, + { + "epoch": 0.04126, + "grad_norm": 0.5065182010593955, + "learning_rate": 0.003, + "loss": 4.151, + "step": 4126 + }, + { + "epoch": 0.04127, + "grad_norm": 0.49270157976785267, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 4127 + }, + { + "epoch": 0.04128, + "grad_norm": 0.47833247711412197, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 4128 + }, + { + "epoch": 0.04129, + "grad_norm": 0.6245977843098431, + "learning_rate": 0.003, + "loss": 4.1696, + "step": 4129 + }, + { + "epoch": 0.0413, + "grad_norm": 0.7299723124473214, + "learning_rate": 0.003, + "loss": 4.1378, + "step": 4130 + }, + { + "epoch": 0.04131, + "grad_norm": 0.7562574300910655, + "learning_rate": 0.003, + "loss": 4.1695, + "step": 4131 + }, + { + "epoch": 0.04132, + "grad_norm": 0.7690429680218056, + "learning_rate": 0.003, + "loss": 4.1343, + "step": 4132 + }, + { + "epoch": 0.04133, + "grad_norm": 0.7790262395321094, + "learning_rate": 0.003, + "loss": 4.1785, + "step": 4133 + }, + { + "epoch": 0.04134, + "grad_norm": 0.7036209580453355, + "learning_rate": 0.003, + "loss": 4.1681, + "step": 4134 + }, + { + "epoch": 0.04135, + "grad_norm": 0.7431461288088913, + "learning_rate": 0.003, + "loss": 4.1528, + "step": 4135 + }, + { + "epoch": 0.04136, + "grad_norm": 0.7470037276613238, + "learning_rate": 0.003, + "loss": 4.1487, + "step": 4136 + }, + { + "epoch": 0.04137, + "grad_norm": 0.8449453146727118, + "learning_rate": 0.003, + "loss": 4.1972, + "step": 4137 + }, + { + "epoch": 0.04138, + "grad_norm": 0.6568218214769733, + "learning_rate": 0.003, + "loss": 4.1742, + "step": 4138 + }, + { + "epoch": 0.04139, + "grad_norm": 0.6371205783656457, + "learning_rate": 0.003, + "loss": 4.1718, + "step": 4139 + }, + { + "epoch": 0.0414, + "grad_norm": 0.724881052006821, + "learning_rate": 0.003, + "loss": 4.176, + "step": 4140 + }, + { + "epoch": 0.04141, + "grad_norm": 0.7146032900181981, + "learning_rate": 0.003, + "loss": 4.1537, + "step": 4141 + }, + { + "epoch": 0.04142, + "grad_norm": 0.8156917337253932, + "learning_rate": 0.003, + "loss": 4.1811, + "step": 4142 + }, + { + "epoch": 0.04143, + "grad_norm": 0.8785836902260167, + "learning_rate": 0.003, + "loss": 4.1747, + "step": 4143 + }, + { + "epoch": 0.04144, + "grad_norm": 0.7999443682311421, + "learning_rate": 0.003, + "loss": 4.1764, + "step": 4144 + }, + { + "epoch": 0.04145, + "grad_norm": 0.82858324833918, + "learning_rate": 0.003, + "loss": 4.1621, + "step": 4145 + }, + { + "epoch": 0.04146, + "grad_norm": 0.8235890608340272, + "learning_rate": 0.003, + "loss": 4.1882, + "step": 4146 + }, + { + "epoch": 0.04147, + "grad_norm": 0.858747508934936, + "learning_rate": 0.003, + "loss": 4.2011, + "step": 4147 + }, + { + "epoch": 0.04148, + "grad_norm": 0.7838271345732937, + "learning_rate": 0.003, + "loss": 4.1568, + "step": 4148 + }, + { + "epoch": 0.04149, + "grad_norm": 0.7263149003922105, + "learning_rate": 0.003, + "loss": 4.1664, + "step": 4149 + }, + { + "epoch": 0.0415, + "grad_norm": 0.6942802417486142, + "learning_rate": 0.003, + "loss": 4.1659, + "step": 4150 + }, + { + "epoch": 0.04151, + "grad_norm": 0.6248553867172758, + "learning_rate": 0.003, + "loss": 4.1813, + "step": 4151 + }, + { + "epoch": 0.04152, + "grad_norm": 0.6112599353340841, + "learning_rate": 0.003, + "loss": 4.1796, + "step": 4152 + }, + { + "epoch": 0.04153, + "grad_norm": 0.5629861808054807, + "learning_rate": 0.003, + "loss": 4.143, + "step": 4153 + }, + { + "epoch": 0.04154, + "grad_norm": 0.5649468221105395, + "learning_rate": 0.003, + "loss": 4.1757, + "step": 4154 + }, + { + "epoch": 0.04155, + "grad_norm": 0.5286657686017564, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 4155 + }, + { + "epoch": 0.04156, + "grad_norm": 0.4616483876621516, + "learning_rate": 0.003, + "loss": 4.1646, + "step": 4156 + }, + { + "epoch": 0.04157, + "grad_norm": 0.43208170797618906, + "learning_rate": 0.003, + "loss": 4.1775, + "step": 4157 + }, + { + "epoch": 0.04158, + "grad_norm": 0.47690558549648143, + "learning_rate": 0.003, + "loss": 4.1765, + "step": 4158 + }, + { + "epoch": 0.04159, + "grad_norm": 0.548409828074487, + "learning_rate": 0.003, + "loss": 4.1476, + "step": 4159 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5628758636950728, + "learning_rate": 0.003, + "loss": 4.1605, + "step": 4160 + }, + { + "epoch": 0.04161, + "grad_norm": 0.6122953755048903, + "learning_rate": 0.003, + "loss": 4.1666, + "step": 4161 + }, + { + "epoch": 0.04162, + "grad_norm": 0.5313833560293301, + "learning_rate": 0.003, + "loss": 4.1222, + "step": 4162 + }, + { + "epoch": 0.04163, + "grad_norm": 0.4545671521128938, + "learning_rate": 0.003, + "loss": 4.1485, + "step": 4163 + }, + { + "epoch": 0.04164, + "grad_norm": 0.48581450867041054, + "learning_rate": 0.003, + "loss": 4.169, + "step": 4164 + }, + { + "epoch": 0.04165, + "grad_norm": 0.48634350202313903, + "learning_rate": 0.003, + "loss": 4.1436, + "step": 4165 + }, + { + "epoch": 0.04166, + "grad_norm": 0.42940905174204824, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 4166 + }, + { + "epoch": 0.04167, + "grad_norm": 0.39230202494683986, + "learning_rate": 0.003, + "loss": 4.1526, + "step": 4167 + }, + { + "epoch": 0.04168, + "grad_norm": 0.39538400082145897, + "learning_rate": 0.003, + "loss": 4.1535, + "step": 4168 + }, + { + "epoch": 0.04169, + "grad_norm": 0.3447120216665393, + "learning_rate": 0.003, + "loss": 4.1582, + "step": 4169 + }, + { + "epoch": 0.0417, + "grad_norm": 0.36603795136119954, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 4170 + }, + { + "epoch": 0.04171, + "grad_norm": 0.38374101547618106, + "learning_rate": 0.003, + "loss": 4.1395, + "step": 4171 + }, + { + "epoch": 0.04172, + "grad_norm": 0.5162775730950696, + "learning_rate": 0.003, + "loss": 4.1617, + "step": 4172 + }, + { + "epoch": 0.04173, + "grad_norm": 0.5760736714087062, + "learning_rate": 0.003, + "loss": 4.1592, + "step": 4173 + }, + { + "epoch": 0.04174, + "grad_norm": 0.5576881465067848, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 4174 + }, + { + "epoch": 0.04175, + "grad_norm": 0.6077258689017496, + "learning_rate": 0.003, + "loss": 4.1771, + "step": 4175 + }, + { + "epoch": 0.04176, + "grad_norm": 0.6416498201931042, + "learning_rate": 0.003, + "loss": 4.1406, + "step": 4176 + }, + { + "epoch": 0.04177, + "grad_norm": 0.5865172170220208, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 4177 + }, + { + "epoch": 0.04178, + "grad_norm": 0.6121973272164818, + "learning_rate": 0.003, + "loss": 4.1495, + "step": 4178 + }, + { + "epoch": 0.04179, + "grad_norm": 0.7466591370840786, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 4179 + }, + { + "epoch": 0.0418, + "grad_norm": 0.7407658974314641, + "learning_rate": 0.003, + "loss": 4.1469, + "step": 4180 + }, + { + "epoch": 0.04181, + "grad_norm": 0.7876489798566603, + "learning_rate": 0.003, + "loss": 4.1518, + "step": 4181 + }, + { + "epoch": 0.04182, + "grad_norm": 0.6878533187156349, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 4182 + }, + { + "epoch": 0.04183, + "grad_norm": 0.6899295854896734, + "learning_rate": 0.003, + "loss": 4.1428, + "step": 4183 + }, + { + "epoch": 0.04184, + "grad_norm": 0.6688165143357564, + "learning_rate": 0.003, + "loss": 4.1979, + "step": 4184 + }, + { + "epoch": 0.04185, + "grad_norm": 0.6585905689769214, + "learning_rate": 0.003, + "loss": 4.1775, + "step": 4185 + }, + { + "epoch": 0.04186, + "grad_norm": 0.706982773019547, + "learning_rate": 0.003, + "loss": 4.1229, + "step": 4186 + }, + { + "epoch": 0.04187, + "grad_norm": 0.7286914544536822, + "learning_rate": 0.003, + "loss": 4.1792, + "step": 4187 + }, + { + "epoch": 0.04188, + "grad_norm": 0.7190589294145873, + "learning_rate": 0.003, + "loss": 4.1395, + "step": 4188 + }, + { + "epoch": 0.04189, + "grad_norm": 0.7351396129481365, + "learning_rate": 0.003, + "loss": 4.1607, + "step": 4189 + }, + { + "epoch": 0.0419, + "grad_norm": 0.7646912861138266, + "learning_rate": 0.003, + "loss": 4.1584, + "step": 4190 + }, + { + "epoch": 0.04191, + "grad_norm": 0.8912342474994416, + "learning_rate": 0.003, + "loss": 4.1703, + "step": 4191 + }, + { + "epoch": 0.04192, + "grad_norm": 0.8349167750236294, + "learning_rate": 0.003, + "loss": 4.155, + "step": 4192 + }, + { + "epoch": 0.04193, + "grad_norm": 0.8412143448929614, + "learning_rate": 0.003, + "loss": 4.139, + "step": 4193 + }, + { + "epoch": 0.04194, + "grad_norm": 0.7468627479626141, + "learning_rate": 0.003, + "loss": 4.1323, + "step": 4194 + }, + { + "epoch": 0.04195, + "grad_norm": 0.675962819896126, + "learning_rate": 0.003, + "loss": 4.1829, + "step": 4195 + }, + { + "epoch": 0.04196, + "grad_norm": 0.6834824762686134, + "learning_rate": 0.003, + "loss": 4.143, + "step": 4196 + }, + { + "epoch": 0.04197, + "grad_norm": 0.6470038410606324, + "learning_rate": 0.003, + "loss": 4.1519, + "step": 4197 + }, + { + "epoch": 0.04198, + "grad_norm": 0.6260474494865785, + "learning_rate": 0.003, + "loss": 4.1463, + "step": 4198 + }, + { + "epoch": 0.04199, + "grad_norm": 0.6233390255040572, + "learning_rate": 0.003, + "loss": 4.1821, + "step": 4199 + }, + { + "epoch": 0.042, + "grad_norm": 0.5419194177090347, + "learning_rate": 0.003, + "loss": 4.1585, + "step": 4200 + }, + { + "epoch": 0.04201, + "grad_norm": 0.4956778815254003, + "learning_rate": 0.003, + "loss": 4.1349, + "step": 4201 + }, + { + "epoch": 0.04202, + "grad_norm": 0.4424272529732883, + "learning_rate": 0.003, + "loss": 4.1485, + "step": 4202 + }, + { + "epoch": 0.04203, + "grad_norm": 0.4059745715455596, + "learning_rate": 0.003, + "loss": 4.1669, + "step": 4203 + }, + { + "epoch": 0.04204, + "grad_norm": 0.3551788544607894, + "learning_rate": 0.003, + "loss": 4.1305, + "step": 4204 + }, + { + "epoch": 0.04205, + "grad_norm": 0.36710978367184766, + "learning_rate": 0.003, + "loss": 4.1546, + "step": 4205 + }, + { + "epoch": 0.04206, + "grad_norm": 0.37067515609326535, + "learning_rate": 0.003, + "loss": 4.1466, + "step": 4206 + }, + { + "epoch": 0.04207, + "grad_norm": 0.3909030628374286, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 4207 + }, + { + "epoch": 0.04208, + "grad_norm": 0.3512353865469784, + "learning_rate": 0.003, + "loss": 4.1436, + "step": 4208 + }, + { + "epoch": 0.04209, + "grad_norm": 0.3222350876286574, + "learning_rate": 0.003, + "loss": 4.1456, + "step": 4209 + }, + { + "epoch": 0.0421, + "grad_norm": 0.33699670679266025, + "learning_rate": 0.003, + "loss": 4.103, + "step": 4210 + }, + { + "epoch": 0.04211, + "grad_norm": 0.397353308681094, + "learning_rate": 0.003, + "loss": 4.1419, + "step": 4211 + }, + { + "epoch": 0.04212, + "grad_norm": 0.4950752479622572, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 4212 + }, + { + "epoch": 0.04213, + "grad_norm": 0.6908228603245805, + "learning_rate": 0.003, + "loss": 4.1243, + "step": 4213 + }, + { + "epoch": 0.04214, + "grad_norm": 0.8171734853482235, + "learning_rate": 0.003, + "loss": 4.137, + "step": 4214 + }, + { + "epoch": 0.04215, + "grad_norm": 0.8297471255167204, + "learning_rate": 0.003, + "loss": 4.1618, + "step": 4215 + }, + { + "epoch": 0.04216, + "grad_norm": 0.8171229032271536, + "learning_rate": 0.003, + "loss": 4.1269, + "step": 4216 + }, + { + "epoch": 0.04217, + "grad_norm": 0.7119480146884566, + "learning_rate": 0.003, + "loss": 4.1515, + "step": 4217 + }, + { + "epoch": 0.04218, + "grad_norm": 0.7835951039751599, + "learning_rate": 0.003, + "loss": 4.1866, + "step": 4218 + }, + { + "epoch": 0.04219, + "grad_norm": 0.7238721651116449, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 4219 + }, + { + "epoch": 0.0422, + "grad_norm": 0.6704096903245251, + "learning_rate": 0.003, + "loss": 4.1709, + "step": 4220 + }, + { + "epoch": 0.04221, + "grad_norm": 0.6137772128626461, + "learning_rate": 0.003, + "loss": 4.1455, + "step": 4221 + }, + { + "epoch": 0.04222, + "grad_norm": 0.6312616992519636, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 4222 + }, + { + "epoch": 0.04223, + "grad_norm": 0.7309830489217892, + "learning_rate": 0.003, + "loss": 4.179, + "step": 4223 + }, + { + "epoch": 0.04224, + "grad_norm": 0.8353004708644942, + "learning_rate": 0.003, + "loss": 4.1874, + "step": 4224 + }, + { + "epoch": 0.04225, + "grad_norm": 0.8671668423571915, + "learning_rate": 0.003, + "loss": 4.1793, + "step": 4225 + }, + { + "epoch": 0.04226, + "grad_norm": 0.7341929503846356, + "learning_rate": 0.003, + "loss": 4.1611, + "step": 4226 + }, + { + "epoch": 0.04227, + "grad_norm": 0.6287599565012932, + "learning_rate": 0.003, + "loss": 4.1584, + "step": 4227 + }, + { + "epoch": 0.04228, + "grad_norm": 0.594514738745631, + "learning_rate": 0.003, + "loss": 4.1394, + "step": 4228 + }, + { + "epoch": 0.04229, + "grad_norm": 0.6229152206319786, + "learning_rate": 0.003, + "loss": 4.1691, + "step": 4229 + }, + { + "epoch": 0.0423, + "grad_norm": 0.6207671248596826, + "learning_rate": 0.003, + "loss": 4.1563, + "step": 4230 + }, + { + "epoch": 0.04231, + "grad_norm": 0.622706916945179, + "learning_rate": 0.003, + "loss": 4.1528, + "step": 4231 + }, + { + "epoch": 0.04232, + "grad_norm": 0.6141015860612352, + "learning_rate": 0.003, + "loss": 4.1626, + "step": 4232 + }, + { + "epoch": 0.04233, + "grad_norm": 0.5575040241097531, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 4233 + }, + { + "epoch": 0.04234, + "grad_norm": 0.5779738605421891, + "learning_rate": 0.003, + "loss": 4.147, + "step": 4234 + }, + { + "epoch": 0.04235, + "grad_norm": 0.6628268406822522, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 4235 + }, + { + "epoch": 0.04236, + "grad_norm": 0.5848188526492533, + "learning_rate": 0.003, + "loss": 4.1428, + "step": 4236 + }, + { + "epoch": 0.04237, + "grad_norm": 0.4606869163911272, + "learning_rate": 0.003, + "loss": 4.109, + "step": 4237 + }, + { + "epoch": 0.04238, + "grad_norm": 0.44274972044788236, + "learning_rate": 0.003, + "loss": 4.1695, + "step": 4238 + }, + { + "epoch": 0.04239, + "grad_norm": 0.4300796410055906, + "learning_rate": 0.003, + "loss": 4.15, + "step": 4239 + }, + { + "epoch": 0.0424, + "grad_norm": 0.3931497226975424, + "learning_rate": 0.003, + "loss": 4.1486, + "step": 4240 + }, + { + "epoch": 0.04241, + "grad_norm": 0.44468609283123023, + "learning_rate": 0.003, + "loss": 4.1623, + "step": 4241 + }, + { + "epoch": 0.04242, + "grad_norm": 0.42999491902207304, + "learning_rate": 0.003, + "loss": 4.1477, + "step": 4242 + }, + { + "epoch": 0.04243, + "grad_norm": 0.45444243019546493, + "learning_rate": 0.003, + "loss": 4.1314, + "step": 4243 + }, + { + "epoch": 0.04244, + "grad_norm": 0.4177997103887438, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 4244 + }, + { + "epoch": 0.04245, + "grad_norm": 0.4042766192791119, + "learning_rate": 0.003, + "loss": 4.1297, + "step": 4245 + }, + { + "epoch": 0.04246, + "grad_norm": 0.49465297913164696, + "learning_rate": 0.003, + "loss": 4.1578, + "step": 4246 + }, + { + "epoch": 0.04247, + "grad_norm": 0.7023548593149965, + "learning_rate": 0.003, + "loss": 4.1382, + "step": 4247 + }, + { + "epoch": 0.04248, + "grad_norm": 0.944713514521764, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 4248 + }, + { + "epoch": 0.04249, + "grad_norm": 0.9299347494145838, + "learning_rate": 0.003, + "loss": 4.1498, + "step": 4249 + }, + { + "epoch": 0.0425, + "grad_norm": 0.7051640509051822, + "learning_rate": 0.003, + "loss": 4.1595, + "step": 4250 + }, + { + "epoch": 0.04251, + "grad_norm": 0.6812273114666765, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 4251 + }, + { + "epoch": 0.04252, + "grad_norm": 0.6230213953232125, + "learning_rate": 0.003, + "loss": 4.1632, + "step": 4252 + }, + { + "epoch": 0.04253, + "grad_norm": 0.6107603990750144, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 4253 + }, + { + "epoch": 0.04254, + "grad_norm": 0.5714927077917459, + "learning_rate": 0.003, + "loss": 4.1498, + "step": 4254 + }, + { + "epoch": 0.04255, + "grad_norm": 0.5988656811798401, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 4255 + }, + { + "epoch": 0.04256, + "grad_norm": 0.6394087469893323, + "learning_rate": 0.003, + "loss": 4.1425, + "step": 4256 + }, + { + "epoch": 0.04257, + "grad_norm": 0.5967251959756738, + "learning_rate": 0.003, + "loss": 4.1615, + "step": 4257 + }, + { + "epoch": 0.04258, + "grad_norm": 0.5945289215936239, + "learning_rate": 0.003, + "loss": 4.1601, + "step": 4258 + }, + { + "epoch": 0.04259, + "grad_norm": 0.5867660553093561, + "learning_rate": 0.003, + "loss": 4.1269, + "step": 4259 + }, + { + "epoch": 0.0426, + "grad_norm": 0.48714297403717527, + "learning_rate": 0.003, + "loss": 4.1369, + "step": 4260 + }, + { + "epoch": 0.04261, + "grad_norm": 0.4560387546134075, + "learning_rate": 0.003, + "loss": 4.1483, + "step": 4261 + }, + { + "epoch": 0.04262, + "grad_norm": 0.43694421386753557, + "learning_rate": 0.003, + "loss": 4.1656, + "step": 4262 + }, + { + "epoch": 0.04263, + "grad_norm": 0.474489343873346, + "learning_rate": 0.003, + "loss": 4.1465, + "step": 4263 + }, + { + "epoch": 0.04264, + "grad_norm": 0.48969639817641303, + "learning_rate": 0.003, + "loss": 4.1564, + "step": 4264 + }, + { + "epoch": 0.04265, + "grad_norm": 0.48935865768007264, + "learning_rate": 0.003, + "loss": 4.1541, + "step": 4265 + }, + { + "epoch": 0.04266, + "grad_norm": 0.4739244721175001, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 4266 + }, + { + "epoch": 0.04267, + "grad_norm": 0.48494742028757803, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 4267 + }, + { + "epoch": 0.04268, + "grad_norm": 0.5422961243923582, + "learning_rate": 0.003, + "loss": 4.1505, + "step": 4268 + }, + { + "epoch": 0.04269, + "grad_norm": 0.5460647987973027, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 4269 + }, + { + "epoch": 0.0427, + "grad_norm": 0.608961907674264, + "learning_rate": 0.003, + "loss": 4.134, + "step": 4270 + }, + { + "epoch": 0.04271, + "grad_norm": 0.6558443912098731, + "learning_rate": 0.003, + "loss": 4.1504, + "step": 4271 + }, + { + "epoch": 0.04272, + "grad_norm": 0.663079753729842, + "learning_rate": 0.003, + "loss": 4.1398, + "step": 4272 + }, + { + "epoch": 0.04273, + "grad_norm": 0.7592770165276204, + "learning_rate": 0.003, + "loss": 4.1263, + "step": 4273 + }, + { + "epoch": 0.04274, + "grad_norm": 0.6676368129361678, + "learning_rate": 0.003, + "loss": 4.1466, + "step": 4274 + }, + { + "epoch": 0.04275, + "grad_norm": 0.6623028472426901, + "learning_rate": 0.003, + "loss": 4.1522, + "step": 4275 + }, + { + "epoch": 0.04276, + "grad_norm": 0.6624560252528698, + "learning_rate": 0.003, + "loss": 4.1719, + "step": 4276 + }, + { + "epoch": 0.04277, + "grad_norm": 0.6210345683298933, + "learning_rate": 0.003, + "loss": 4.1563, + "step": 4277 + }, + { + "epoch": 0.04278, + "grad_norm": 0.5991391029139438, + "learning_rate": 0.003, + "loss": 4.1582, + "step": 4278 + }, + { + "epoch": 0.04279, + "grad_norm": 0.6350138668928702, + "learning_rate": 0.003, + "loss": 4.1353, + "step": 4279 + }, + { + "epoch": 0.0428, + "grad_norm": 0.6849523452457494, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 4280 + }, + { + "epoch": 0.04281, + "grad_norm": 0.6921302141996111, + "learning_rate": 0.003, + "loss": 4.1786, + "step": 4281 + }, + { + "epoch": 0.04282, + "grad_norm": 0.756590885346882, + "learning_rate": 0.003, + "loss": 4.1541, + "step": 4282 + }, + { + "epoch": 0.04283, + "grad_norm": 0.7245456429489706, + "learning_rate": 0.003, + "loss": 4.1927, + "step": 4283 + }, + { + "epoch": 0.04284, + "grad_norm": 0.6672309834707763, + "learning_rate": 0.003, + "loss": 4.1425, + "step": 4284 + }, + { + "epoch": 0.04285, + "grad_norm": 0.6246574113348472, + "learning_rate": 0.003, + "loss": 4.157, + "step": 4285 + }, + { + "epoch": 0.04286, + "grad_norm": 0.4846532441701148, + "learning_rate": 0.003, + "loss": 4.1698, + "step": 4286 + }, + { + "epoch": 0.04287, + "grad_norm": 0.49611202994371745, + "learning_rate": 0.003, + "loss": 4.1484, + "step": 4287 + }, + { + "epoch": 0.04288, + "grad_norm": 0.49123248349204507, + "learning_rate": 0.003, + "loss": 4.1284, + "step": 4288 + }, + { + "epoch": 0.04289, + "grad_norm": 0.4112901668688879, + "learning_rate": 0.003, + "loss": 4.1338, + "step": 4289 + }, + { + "epoch": 0.0429, + "grad_norm": 0.38028576043319295, + "learning_rate": 0.003, + "loss": 4.1169, + "step": 4290 + }, + { + "epoch": 0.04291, + "grad_norm": 0.3996251945749772, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 4291 + }, + { + "epoch": 0.04292, + "grad_norm": 0.43349568162385765, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 4292 + }, + { + "epoch": 0.04293, + "grad_norm": 0.5220793875805044, + "learning_rate": 0.003, + "loss": 4.1623, + "step": 4293 + }, + { + "epoch": 0.04294, + "grad_norm": 0.6050495154218054, + "learning_rate": 0.003, + "loss": 4.1458, + "step": 4294 + }, + { + "epoch": 0.04295, + "grad_norm": 0.7860329987891421, + "learning_rate": 0.003, + "loss": 4.1263, + "step": 4295 + }, + { + "epoch": 0.04296, + "grad_norm": 0.9004405617366277, + "learning_rate": 0.003, + "loss": 4.1327, + "step": 4296 + }, + { + "epoch": 0.04297, + "grad_norm": 0.8479021177195584, + "learning_rate": 0.003, + "loss": 4.1235, + "step": 4297 + }, + { + "epoch": 0.04298, + "grad_norm": 0.7336646368221134, + "learning_rate": 0.003, + "loss": 4.1664, + "step": 4298 + }, + { + "epoch": 0.04299, + "grad_norm": 0.7078429400943396, + "learning_rate": 0.003, + "loss": 4.1542, + "step": 4299 + }, + { + "epoch": 0.043, + "grad_norm": 0.7513235769819381, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 4300 + }, + { + "epoch": 0.04301, + "grad_norm": 0.8326441152753217, + "learning_rate": 0.003, + "loss": 4.1932, + "step": 4301 + }, + { + "epoch": 0.04302, + "grad_norm": 0.6815941455473161, + "learning_rate": 0.003, + "loss": 4.176, + "step": 4302 + }, + { + "epoch": 0.04303, + "grad_norm": 0.6890232000405212, + "learning_rate": 0.003, + "loss": 4.1666, + "step": 4303 + }, + { + "epoch": 0.04304, + "grad_norm": 0.6319808958937929, + "learning_rate": 0.003, + "loss": 4.1606, + "step": 4304 + }, + { + "epoch": 0.04305, + "grad_norm": 0.5899508711069417, + "learning_rate": 0.003, + "loss": 4.1579, + "step": 4305 + }, + { + "epoch": 0.04306, + "grad_norm": 0.5383422790486718, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 4306 + }, + { + "epoch": 0.04307, + "grad_norm": 0.48802486139669815, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 4307 + }, + { + "epoch": 0.04308, + "grad_norm": 0.5309500644286907, + "learning_rate": 0.003, + "loss": 4.1512, + "step": 4308 + }, + { + "epoch": 0.04309, + "grad_norm": 0.5454729884280429, + "learning_rate": 0.003, + "loss": 4.1364, + "step": 4309 + }, + { + "epoch": 0.0431, + "grad_norm": 0.5943838417760108, + "learning_rate": 0.003, + "loss": 4.163, + "step": 4310 + }, + { + "epoch": 0.04311, + "grad_norm": 0.6278301410987507, + "learning_rate": 0.003, + "loss": 4.1581, + "step": 4311 + }, + { + "epoch": 0.04312, + "grad_norm": 0.523985785894097, + "learning_rate": 0.003, + "loss": 4.134, + "step": 4312 + }, + { + "epoch": 0.04313, + "grad_norm": 0.4476351084578838, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 4313 + }, + { + "epoch": 0.04314, + "grad_norm": 0.43202288706395875, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 4314 + }, + { + "epoch": 0.04315, + "grad_norm": 0.47058217727703, + "learning_rate": 0.003, + "loss": 4.1513, + "step": 4315 + }, + { + "epoch": 0.04316, + "grad_norm": 0.5151993523236311, + "learning_rate": 0.003, + "loss": 4.1515, + "step": 4316 + }, + { + "epoch": 0.04317, + "grad_norm": 0.6504034739996317, + "learning_rate": 0.003, + "loss": 4.1687, + "step": 4317 + }, + { + "epoch": 0.04318, + "grad_norm": 0.722856030914651, + "learning_rate": 0.003, + "loss": 4.1142, + "step": 4318 + }, + { + "epoch": 0.04319, + "grad_norm": 0.7408662854196157, + "learning_rate": 0.003, + "loss": 4.1104, + "step": 4319 + }, + { + "epoch": 0.0432, + "grad_norm": 0.6108907111379617, + "learning_rate": 0.003, + "loss": 4.131, + "step": 4320 + }, + { + "epoch": 0.04321, + "grad_norm": 0.5580728193515315, + "learning_rate": 0.003, + "loss": 4.1349, + "step": 4321 + }, + { + "epoch": 0.04322, + "grad_norm": 0.6293577640178539, + "learning_rate": 0.003, + "loss": 4.1718, + "step": 4322 + }, + { + "epoch": 0.04323, + "grad_norm": 0.680052451325024, + "learning_rate": 0.003, + "loss": 4.139, + "step": 4323 + }, + { + "epoch": 0.04324, + "grad_norm": 0.6420666473310551, + "learning_rate": 0.003, + "loss": 4.134, + "step": 4324 + }, + { + "epoch": 0.04325, + "grad_norm": 0.7103856885257132, + "learning_rate": 0.003, + "loss": 4.1668, + "step": 4325 + }, + { + "epoch": 0.04326, + "grad_norm": 0.6921778251891293, + "learning_rate": 0.003, + "loss": 4.1469, + "step": 4326 + }, + { + "epoch": 0.04327, + "grad_norm": 0.688475290776878, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 4327 + }, + { + "epoch": 0.04328, + "grad_norm": 0.6182648113705878, + "learning_rate": 0.003, + "loss": 4.1423, + "step": 4328 + }, + { + "epoch": 0.04329, + "grad_norm": 0.6893506649601877, + "learning_rate": 0.003, + "loss": 4.1652, + "step": 4329 + }, + { + "epoch": 0.0433, + "grad_norm": 0.7382201356692273, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 4330 + }, + { + "epoch": 0.04331, + "grad_norm": 0.6908297901661757, + "learning_rate": 0.003, + "loss": 4.1331, + "step": 4331 + }, + { + "epoch": 0.04332, + "grad_norm": 0.6538224132935573, + "learning_rate": 0.003, + "loss": 4.1798, + "step": 4332 + }, + { + "epoch": 0.04333, + "grad_norm": 0.5839042658168669, + "learning_rate": 0.003, + "loss": 4.1477, + "step": 4333 + }, + { + "epoch": 0.04334, + "grad_norm": 0.5113278780509889, + "learning_rate": 0.003, + "loss": 4.1337, + "step": 4334 + }, + { + "epoch": 0.04335, + "grad_norm": 0.5145875368624392, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 4335 + }, + { + "epoch": 0.04336, + "grad_norm": 0.45945029823422817, + "learning_rate": 0.003, + "loss": 4.1538, + "step": 4336 + }, + { + "epoch": 0.04337, + "grad_norm": 0.48662366939348867, + "learning_rate": 0.003, + "loss": 4.1382, + "step": 4337 + }, + { + "epoch": 0.04338, + "grad_norm": 0.5497905856065477, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 4338 + }, + { + "epoch": 0.04339, + "grad_norm": 0.6427639317081701, + "learning_rate": 0.003, + "loss": 4.1499, + "step": 4339 + }, + { + "epoch": 0.0434, + "grad_norm": 0.751540691985128, + "learning_rate": 0.003, + "loss": 4.1357, + "step": 4340 + }, + { + "epoch": 0.04341, + "grad_norm": 0.8048212003756245, + "learning_rate": 0.003, + "loss": 4.153, + "step": 4341 + }, + { + "epoch": 0.04342, + "grad_norm": 0.7030263986542218, + "learning_rate": 0.003, + "loss": 4.1364, + "step": 4342 + }, + { + "epoch": 0.04343, + "grad_norm": 0.6032636490778056, + "learning_rate": 0.003, + "loss": 4.1739, + "step": 4343 + }, + { + "epoch": 0.04344, + "grad_norm": 0.6918101884406253, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 4344 + }, + { + "epoch": 0.04345, + "grad_norm": 0.822156034408674, + "learning_rate": 0.003, + "loss": 4.1698, + "step": 4345 + }, + { + "epoch": 0.04346, + "grad_norm": 0.8112944749590358, + "learning_rate": 0.003, + "loss": 4.1692, + "step": 4346 + }, + { + "epoch": 0.04347, + "grad_norm": 0.7773363458697162, + "learning_rate": 0.003, + "loss": 4.1737, + "step": 4347 + }, + { + "epoch": 0.04348, + "grad_norm": 0.7166070078793744, + "learning_rate": 0.003, + "loss": 4.1512, + "step": 4348 + }, + { + "epoch": 0.04349, + "grad_norm": 0.7557376957323917, + "learning_rate": 0.003, + "loss": 4.1704, + "step": 4349 + }, + { + "epoch": 0.0435, + "grad_norm": 0.7306221155287103, + "learning_rate": 0.003, + "loss": 4.1582, + "step": 4350 + }, + { + "epoch": 0.04351, + "grad_norm": 0.7396583467273142, + "learning_rate": 0.003, + "loss": 4.1687, + "step": 4351 + }, + { + "epoch": 0.04352, + "grad_norm": 0.7800507975012694, + "learning_rate": 0.003, + "loss": 4.1888, + "step": 4352 + }, + { + "epoch": 0.04353, + "grad_norm": 1.021228379310444, + "learning_rate": 0.003, + "loss": 4.1744, + "step": 4353 + }, + { + "epoch": 0.04354, + "grad_norm": 0.9464406672758259, + "learning_rate": 0.003, + "loss": 4.1858, + "step": 4354 + }, + { + "epoch": 0.04355, + "grad_norm": 0.9680937005908226, + "learning_rate": 0.003, + "loss": 4.19, + "step": 4355 + }, + { + "epoch": 0.04356, + "grad_norm": 0.8607469609935269, + "learning_rate": 0.003, + "loss": 4.2014, + "step": 4356 + }, + { + "epoch": 0.04357, + "grad_norm": 0.823519837090474, + "learning_rate": 0.003, + "loss": 4.1828, + "step": 4357 + }, + { + "epoch": 0.04358, + "grad_norm": 0.9595628631285592, + "learning_rate": 0.003, + "loss": 4.181, + "step": 4358 + }, + { + "epoch": 0.04359, + "grad_norm": 0.9518657368376723, + "learning_rate": 0.003, + "loss": 4.2091, + "step": 4359 + }, + { + "epoch": 0.0436, + "grad_norm": 0.8367488853787286, + "learning_rate": 0.003, + "loss": 4.2039, + "step": 4360 + }, + { + "epoch": 0.04361, + "grad_norm": 0.6767616158798689, + "learning_rate": 0.003, + "loss": 4.1626, + "step": 4361 + }, + { + "epoch": 0.04362, + "grad_norm": 0.6754203563658906, + "learning_rate": 0.003, + "loss": 4.1747, + "step": 4362 + }, + { + "epoch": 0.04363, + "grad_norm": 0.6302727609189303, + "learning_rate": 0.003, + "loss": 4.1735, + "step": 4363 + }, + { + "epoch": 0.04364, + "grad_norm": 0.6320795297761237, + "learning_rate": 0.003, + "loss": 4.1873, + "step": 4364 + }, + { + "epoch": 0.04365, + "grad_norm": 0.6155242365026946, + "learning_rate": 0.003, + "loss": 4.1835, + "step": 4365 + }, + { + "epoch": 0.04366, + "grad_norm": 0.5750840343277154, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 4366 + }, + { + "epoch": 0.04367, + "grad_norm": 0.5395062951089282, + "learning_rate": 0.003, + "loss": 4.1827, + "step": 4367 + }, + { + "epoch": 0.04368, + "grad_norm": 0.5070165557962105, + "learning_rate": 0.003, + "loss": 4.1657, + "step": 4368 + }, + { + "epoch": 0.04369, + "grad_norm": 0.469120529046112, + "learning_rate": 0.003, + "loss": 4.1558, + "step": 4369 + }, + { + "epoch": 0.0437, + "grad_norm": 0.42165323317321096, + "learning_rate": 0.003, + "loss": 4.178, + "step": 4370 + }, + { + "epoch": 0.04371, + "grad_norm": 0.4102155524937744, + "learning_rate": 0.003, + "loss": 4.1607, + "step": 4371 + }, + { + "epoch": 0.04372, + "grad_norm": 0.45131221099147395, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 4372 + }, + { + "epoch": 0.04373, + "grad_norm": 0.5064889554273249, + "learning_rate": 0.003, + "loss": 4.1634, + "step": 4373 + }, + { + "epoch": 0.04374, + "grad_norm": 0.6523859432482797, + "learning_rate": 0.003, + "loss": 4.2092, + "step": 4374 + }, + { + "epoch": 0.04375, + "grad_norm": 0.7145171139398658, + "learning_rate": 0.003, + "loss": 4.1237, + "step": 4375 + }, + { + "epoch": 0.04376, + "grad_norm": 0.6067647597260136, + "learning_rate": 0.003, + "loss": 4.1632, + "step": 4376 + }, + { + "epoch": 0.04377, + "grad_norm": 0.4114371807638281, + "learning_rate": 0.003, + "loss": 4.1817, + "step": 4377 + }, + { + "epoch": 0.04378, + "grad_norm": 0.4767337347167691, + "learning_rate": 0.003, + "loss": 4.1646, + "step": 4378 + }, + { + "epoch": 0.04379, + "grad_norm": 0.5776389327364934, + "learning_rate": 0.003, + "loss": 4.1831, + "step": 4379 + }, + { + "epoch": 0.0438, + "grad_norm": 0.5129468592408087, + "learning_rate": 0.003, + "loss": 4.1488, + "step": 4380 + }, + { + "epoch": 0.04381, + "grad_norm": 0.4520287221841305, + "learning_rate": 0.003, + "loss": 4.1367, + "step": 4381 + }, + { + "epoch": 0.04382, + "grad_norm": 0.5197515685911448, + "learning_rate": 0.003, + "loss": 4.1497, + "step": 4382 + }, + { + "epoch": 0.04383, + "grad_norm": 0.5177934608058117, + "learning_rate": 0.003, + "loss": 4.1269, + "step": 4383 + }, + { + "epoch": 0.04384, + "grad_norm": 0.5061115119488518, + "learning_rate": 0.003, + "loss": 4.1319, + "step": 4384 + }, + { + "epoch": 0.04385, + "grad_norm": 0.4979837350700543, + "learning_rate": 0.003, + "loss": 4.168, + "step": 4385 + }, + { + "epoch": 0.04386, + "grad_norm": 0.5170426202552209, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 4386 + }, + { + "epoch": 0.04387, + "grad_norm": 0.497700525982779, + "learning_rate": 0.003, + "loss": 4.164, + "step": 4387 + }, + { + "epoch": 0.04388, + "grad_norm": 0.5110241434141121, + "learning_rate": 0.003, + "loss": 4.125, + "step": 4388 + }, + { + "epoch": 0.04389, + "grad_norm": 0.47594850470553707, + "learning_rate": 0.003, + "loss": 4.1555, + "step": 4389 + }, + { + "epoch": 0.0439, + "grad_norm": 0.5028500411846144, + "learning_rate": 0.003, + "loss": 4.1519, + "step": 4390 + }, + { + "epoch": 0.04391, + "grad_norm": 0.4958670611124616, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 4391 + }, + { + "epoch": 0.04392, + "grad_norm": 0.45433519309198606, + "learning_rate": 0.003, + "loss": 4.129, + "step": 4392 + }, + { + "epoch": 0.04393, + "grad_norm": 0.42745058981506323, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 4393 + }, + { + "epoch": 0.04394, + "grad_norm": 0.4270590633401501, + "learning_rate": 0.003, + "loss": 4.1354, + "step": 4394 + }, + { + "epoch": 0.04395, + "grad_norm": 0.48083329680294346, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 4395 + }, + { + "epoch": 0.04396, + "grad_norm": 0.4459452323390242, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 4396 + }, + { + "epoch": 0.04397, + "grad_norm": 0.4716168217418122, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 4397 + }, + { + "epoch": 0.04398, + "grad_norm": 0.5721086479882349, + "learning_rate": 0.003, + "loss": 4.1297, + "step": 4398 + }, + { + "epoch": 0.04399, + "grad_norm": 0.629767772703653, + "learning_rate": 0.003, + "loss": 4.1394, + "step": 4399 + }, + { + "epoch": 0.044, + "grad_norm": 0.6149549402399486, + "learning_rate": 0.003, + "loss": 4.1329, + "step": 4400 + }, + { + "epoch": 0.04401, + "grad_norm": 0.573854062146023, + "learning_rate": 0.003, + "loss": 4.1469, + "step": 4401 + }, + { + "epoch": 0.04402, + "grad_norm": 0.5937371470991296, + "learning_rate": 0.003, + "loss": 4.1514, + "step": 4402 + }, + { + "epoch": 0.04403, + "grad_norm": 0.7181194724479543, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 4403 + }, + { + "epoch": 0.04404, + "grad_norm": 0.7454554555523588, + "learning_rate": 0.003, + "loss": 4.1537, + "step": 4404 + }, + { + "epoch": 0.04405, + "grad_norm": 0.7902792519841142, + "learning_rate": 0.003, + "loss": 4.1518, + "step": 4405 + }, + { + "epoch": 0.04406, + "grad_norm": 0.7207216322892295, + "learning_rate": 0.003, + "loss": 4.1505, + "step": 4406 + }, + { + "epoch": 0.04407, + "grad_norm": 0.6745132067112365, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 4407 + }, + { + "epoch": 0.04408, + "grad_norm": 0.6455801650691928, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 4408 + }, + { + "epoch": 0.04409, + "grad_norm": 0.6394593941217028, + "learning_rate": 0.003, + "loss": 4.1634, + "step": 4409 + }, + { + "epoch": 0.0441, + "grad_norm": 0.49947842679511295, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 4410 + }, + { + "epoch": 0.04411, + "grad_norm": 0.49572809609459384, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 4411 + }, + { + "epoch": 0.04412, + "grad_norm": 0.5979052104111249, + "learning_rate": 0.003, + "loss": 4.116, + "step": 4412 + }, + { + "epoch": 0.04413, + "grad_norm": 0.7297806765539856, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 4413 + }, + { + "epoch": 0.04414, + "grad_norm": 0.8440545970233042, + "learning_rate": 0.003, + "loss": 4.1895, + "step": 4414 + }, + { + "epoch": 0.04415, + "grad_norm": 0.8680059315838015, + "learning_rate": 0.003, + "loss": 4.1572, + "step": 4415 + }, + { + "epoch": 0.04416, + "grad_norm": 0.6447487616588353, + "learning_rate": 0.003, + "loss": 4.1383, + "step": 4416 + }, + { + "epoch": 0.04417, + "grad_norm": 0.5118950493406013, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 4417 + }, + { + "epoch": 0.04418, + "grad_norm": 0.524336962344827, + "learning_rate": 0.003, + "loss": 4.1536, + "step": 4418 + }, + { + "epoch": 0.04419, + "grad_norm": 0.5573856100230878, + "learning_rate": 0.003, + "loss": 4.1718, + "step": 4419 + }, + { + "epoch": 0.0442, + "grad_norm": 0.5231826126803166, + "learning_rate": 0.003, + "loss": 4.1397, + "step": 4420 + }, + { + "epoch": 0.04421, + "grad_norm": 0.5267586062051023, + "learning_rate": 0.003, + "loss": 4.1727, + "step": 4421 + }, + { + "epoch": 0.04422, + "grad_norm": 0.6148173783561381, + "learning_rate": 0.003, + "loss": 4.1362, + "step": 4422 + }, + { + "epoch": 0.04423, + "grad_norm": 0.639677614962493, + "learning_rate": 0.003, + "loss": 4.1571, + "step": 4423 + }, + { + "epoch": 0.04424, + "grad_norm": 0.7269549825343812, + "learning_rate": 0.003, + "loss": 4.108, + "step": 4424 + }, + { + "epoch": 0.04425, + "grad_norm": 0.7800602356462868, + "learning_rate": 0.003, + "loss": 4.1375, + "step": 4425 + }, + { + "epoch": 0.04426, + "grad_norm": 0.6552710671788101, + "learning_rate": 0.003, + "loss": 4.1586, + "step": 4426 + }, + { + "epoch": 0.04427, + "grad_norm": 0.6600251269485187, + "learning_rate": 0.003, + "loss": 4.1459, + "step": 4427 + }, + { + "epoch": 0.04428, + "grad_norm": 0.7074082478081082, + "learning_rate": 0.003, + "loss": 4.1438, + "step": 4428 + }, + { + "epoch": 0.04429, + "grad_norm": 0.6687710065867836, + "learning_rate": 0.003, + "loss": 4.1838, + "step": 4429 + }, + { + "epoch": 0.0443, + "grad_norm": 0.6237696770618166, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 4430 + }, + { + "epoch": 0.04431, + "grad_norm": 0.5184995550940078, + "learning_rate": 0.003, + "loss": 4.1591, + "step": 4431 + }, + { + "epoch": 0.04432, + "grad_norm": 0.5237611236731565, + "learning_rate": 0.003, + "loss": 4.1643, + "step": 4432 + }, + { + "epoch": 0.04433, + "grad_norm": 0.4836240843515611, + "learning_rate": 0.003, + "loss": 4.136, + "step": 4433 + }, + { + "epoch": 0.04434, + "grad_norm": 0.4375263143804872, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 4434 + }, + { + "epoch": 0.04435, + "grad_norm": 0.4294571339856274, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 4435 + }, + { + "epoch": 0.04436, + "grad_norm": 0.4235691217798773, + "learning_rate": 0.003, + "loss": 4.119, + "step": 4436 + }, + { + "epoch": 0.04437, + "grad_norm": 0.38987028812840846, + "learning_rate": 0.003, + "loss": 4.1415, + "step": 4437 + }, + { + "epoch": 0.04438, + "grad_norm": 0.366302357806493, + "learning_rate": 0.003, + "loss": 4.1767, + "step": 4438 + }, + { + "epoch": 0.04439, + "grad_norm": 0.39300818537655363, + "learning_rate": 0.003, + "loss": 4.1452, + "step": 4439 + }, + { + "epoch": 0.0444, + "grad_norm": 0.4338089650520033, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 4440 + }, + { + "epoch": 0.04441, + "grad_norm": 0.48573366379460514, + "learning_rate": 0.003, + "loss": 4.1349, + "step": 4441 + }, + { + "epoch": 0.04442, + "grad_norm": 0.5334130138073742, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 4442 + }, + { + "epoch": 0.04443, + "grad_norm": 0.5670666571686344, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 4443 + }, + { + "epoch": 0.04444, + "grad_norm": 0.5320583177110734, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 4444 + }, + { + "epoch": 0.04445, + "grad_norm": 0.489885387553983, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 4445 + }, + { + "epoch": 0.04446, + "grad_norm": 0.559048561777276, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 4446 + }, + { + "epoch": 0.04447, + "grad_norm": 0.6964141693448826, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 4447 + }, + { + "epoch": 0.04448, + "grad_norm": 0.7839348711029371, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 4448 + }, + { + "epoch": 0.04449, + "grad_norm": 0.8168410712463996, + "learning_rate": 0.003, + "loss": 4.1417, + "step": 4449 + }, + { + "epoch": 0.0445, + "grad_norm": 0.7469909706513354, + "learning_rate": 0.003, + "loss": 4.1433, + "step": 4450 + }, + { + "epoch": 0.04451, + "grad_norm": 0.7772290265480061, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 4451 + }, + { + "epoch": 0.04452, + "grad_norm": 0.750429713415397, + "learning_rate": 0.003, + "loss": 4.1606, + "step": 4452 + }, + { + "epoch": 0.04453, + "grad_norm": 0.8132686578476264, + "learning_rate": 0.003, + "loss": 4.1734, + "step": 4453 + }, + { + "epoch": 0.04454, + "grad_norm": 0.7560392406490976, + "learning_rate": 0.003, + "loss": 4.1543, + "step": 4454 + }, + { + "epoch": 0.04455, + "grad_norm": 0.6772193323346015, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 4455 + }, + { + "epoch": 0.04456, + "grad_norm": 0.6001846034669396, + "learning_rate": 0.003, + "loss": 4.1249, + "step": 4456 + }, + { + "epoch": 0.04457, + "grad_norm": 0.555927693348537, + "learning_rate": 0.003, + "loss": 4.1401, + "step": 4457 + }, + { + "epoch": 0.04458, + "grad_norm": 0.567937186369503, + "learning_rate": 0.003, + "loss": 4.1625, + "step": 4458 + }, + { + "epoch": 0.04459, + "grad_norm": 0.6268058209460201, + "learning_rate": 0.003, + "loss": 4.1697, + "step": 4459 + }, + { + "epoch": 0.0446, + "grad_norm": 0.6774954694650952, + "learning_rate": 0.003, + "loss": 4.1699, + "step": 4460 + }, + { + "epoch": 0.04461, + "grad_norm": 0.7113381022778253, + "learning_rate": 0.003, + "loss": 4.1744, + "step": 4461 + }, + { + "epoch": 0.04462, + "grad_norm": 0.7089477778475621, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 4462 + }, + { + "epoch": 0.04463, + "grad_norm": 0.7149044485381956, + "learning_rate": 0.003, + "loss": 4.1433, + "step": 4463 + }, + { + "epoch": 0.04464, + "grad_norm": 0.7001719619214657, + "learning_rate": 0.003, + "loss": 4.1603, + "step": 4464 + }, + { + "epoch": 0.04465, + "grad_norm": 0.641827326565005, + "learning_rate": 0.003, + "loss": 4.1468, + "step": 4465 + }, + { + "epoch": 0.04466, + "grad_norm": 0.5710205222869948, + "learning_rate": 0.003, + "loss": 4.1542, + "step": 4466 + }, + { + "epoch": 0.04467, + "grad_norm": 0.5879622422772139, + "learning_rate": 0.003, + "loss": 4.1512, + "step": 4467 + }, + { + "epoch": 0.04468, + "grad_norm": 0.5833275454344445, + "learning_rate": 0.003, + "loss": 4.1253, + "step": 4468 + }, + { + "epoch": 0.04469, + "grad_norm": 0.5035664280245044, + "learning_rate": 0.003, + "loss": 4.1536, + "step": 4469 + }, + { + "epoch": 0.0447, + "grad_norm": 0.5330652391453462, + "learning_rate": 0.003, + "loss": 4.1565, + "step": 4470 + }, + { + "epoch": 0.04471, + "grad_norm": 0.5415312535775053, + "learning_rate": 0.003, + "loss": 4.1754, + "step": 4471 + }, + { + "epoch": 0.04472, + "grad_norm": 0.6201601279104281, + "learning_rate": 0.003, + "loss": 4.129, + "step": 4472 + }, + { + "epoch": 0.04473, + "grad_norm": 0.5823075422954469, + "learning_rate": 0.003, + "loss": 4.1304, + "step": 4473 + }, + { + "epoch": 0.04474, + "grad_norm": 0.5211525998630241, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 4474 + }, + { + "epoch": 0.04475, + "grad_norm": 0.5787022872529871, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 4475 + }, + { + "epoch": 0.04476, + "grad_norm": 0.6268645285401386, + "learning_rate": 0.003, + "loss": 4.1321, + "step": 4476 + }, + { + "epoch": 0.04477, + "grad_norm": 0.7238878952105227, + "learning_rate": 0.003, + "loss": 4.1527, + "step": 4477 + }, + { + "epoch": 0.04478, + "grad_norm": 0.7625353869137804, + "learning_rate": 0.003, + "loss": 4.1522, + "step": 4478 + }, + { + "epoch": 0.04479, + "grad_norm": 0.7572839617963258, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 4479 + }, + { + "epoch": 0.0448, + "grad_norm": 0.7894575112052167, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 4480 + }, + { + "epoch": 0.04481, + "grad_norm": 0.74839779052581, + "learning_rate": 0.003, + "loss": 4.1423, + "step": 4481 + }, + { + "epoch": 0.04482, + "grad_norm": 0.7334427098809556, + "learning_rate": 0.003, + "loss": 4.1657, + "step": 4482 + }, + { + "epoch": 0.04483, + "grad_norm": 0.7262695320930564, + "learning_rate": 0.003, + "loss": 4.152, + "step": 4483 + }, + { + "epoch": 0.04484, + "grad_norm": 0.6647684381777006, + "learning_rate": 0.003, + "loss": 4.165, + "step": 4484 + }, + { + "epoch": 0.04485, + "grad_norm": 0.6381708748446918, + "learning_rate": 0.003, + "loss": 4.1719, + "step": 4485 + }, + { + "epoch": 0.04486, + "grad_norm": 0.5675731699113569, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 4486 + }, + { + "epoch": 0.04487, + "grad_norm": 0.6472424225913131, + "learning_rate": 0.003, + "loss": 4.1412, + "step": 4487 + }, + { + "epoch": 0.04488, + "grad_norm": 0.694626079092874, + "learning_rate": 0.003, + "loss": 4.1617, + "step": 4488 + }, + { + "epoch": 0.04489, + "grad_norm": 0.6658909000300576, + "learning_rate": 0.003, + "loss": 4.1656, + "step": 4489 + }, + { + "epoch": 0.0449, + "grad_norm": 0.5596108130436326, + "learning_rate": 0.003, + "loss": 4.156, + "step": 4490 + }, + { + "epoch": 0.04491, + "grad_norm": 0.4950212459279141, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 4491 + }, + { + "epoch": 0.04492, + "grad_norm": 0.5347911305158913, + "learning_rate": 0.003, + "loss": 4.1536, + "step": 4492 + }, + { + "epoch": 0.04493, + "grad_norm": 0.583677850500047, + "learning_rate": 0.003, + "loss": 4.1599, + "step": 4493 + }, + { + "epoch": 0.04494, + "grad_norm": 0.5509029147435317, + "learning_rate": 0.003, + "loss": 4.1386, + "step": 4494 + }, + { + "epoch": 0.04495, + "grad_norm": 0.5187917488469639, + "learning_rate": 0.003, + "loss": 4.1367, + "step": 4495 + }, + { + "epoch": 0.04496, + "grad_norm": 0.530988798745176, + "learning_rate": 0.003, + "loss": 4.1746, + "step": 4496 + }, + { + "epoch": 0.04497, + "grad_norm": 0.5020112887190497, + "learning_rate": 0.003, + "loss": 4.1538, + "step": 4497 + }, + { + "epoch": 0.04498, + "grad_norm": 0.5882091094240027, + "learning_rate": 0.003, + "loss": 4.1282, + "step": 4498 + }, + { + "epoch": 0.04499, + "grad_norm": 0.6882774056835383, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 4499 + }, + { + "epoch": 0.045, + "grad_norm": 0.7302113825660921, + "learning_rate": 0.003, + "loss": 4.1396, + "step": 4500 + }, + { + "epoch": 0.04501, + "grad_norm": 0.6431193209544874, + "learning_rate": 0.003, + "loss": 4.1285, + "step": 4501 + }, + { + "epoch": 0.04502, + "grad_norm": 0.6128789446358794, + "learning_rate": 0.003, + "loss": 4.1703, + "step": 4502 + }, + { + "epoch": 0.04503, + "grad_norm": 0.563632556792813, + "learning_rate": 0.003, + "loss": 4.167, + "step": 4503 + }, + { + "epoch": 0.04504, + "grad_norm": 0.49069503580557566, + "learning_rate": 0.003, + "loss": 4.1633, + "step": 4504 + }, + { + "epoch": 0.04505, + "grad_norm": 0.5316897314034865, + "learning_rate": 0.003, + "loss": 4.1583, + "step": 4505 + }, + { + "epoch": 0.04506, + "grad_norm": 0.485539206424938, + "learning_rate": 0.003, + "loss": 4.1369, + "step": 4506 + }, + { + "epoch": 0.04507, + "grad_norm": 0.4651162125504691, + "learning_rate": 0.003, + "loss": 4.109, + "step": 4507 + }, + { + "epoch": 0.04508, + "grad_norm": 0.47136511588197955, + "learning_rate": 0.003, + "loss": 4.1303, + "step": 4508 + }, + { + "epoch": 0.04509, + "grad_norm": 0.48078179210487143, + "learning_rate": 0.003, + "loss": 4.153, + "step": 4509 + }, + { + "epoch": 0.0451, + "grad_norm": 0.5697241941596817, + "learning_rate": 0.003, + "loss": 4.1586, + "step": 4510 + }, + { + "epoch": 0.04511, + "grad_norm": 0.6691571245585908, + "learning_rate": 0.003, + "loss": 4.1463, + "step": 4511 + }, + { + "epoch": 0.04512, + "grad_norm": 0.7515863901593889, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 4512 + }, + { + "epoch": 0.04513, + "grad_norm": 0.7483616185624625, + "learning_rate": 0.003, + "loss": 4.1391, + "step": 4513 + }, + { + "epoch": 0.04514, + "grad_norm": 0.703461108276439, + "learning_rate": 0.003, + "loss": 4.1439, + "step": 4514 + }, + { + "epoch": 0.04515, + "grad_norm": 0.7246314655645058, + "learning_rate": 0.003, + "loss": 4.1418, + "step": 4515 + }, + { + "epoch": 0.04516, + "grad_norm": 0.6368515120508723, + "learning_rate": 0.003, + "loss": 4.1523, + "step": 4516 + }, + { + "epoch": 0.04517, + "grad_norm": 0.5222304173800686, + "learning_rate": 0.003, + "loss": 4.1611, + "step": 4517 + }, + { + "epoch": 0.04518, + "grad_norm": 0.48504759929098307, + "learning_rate": 0.003, + "loss": 4.1337, + "step": 4518 + }, + { + "epoch": 0.04519, + "grad_norm": 0.43996749102619037, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 4519 + }, + { + "epoch": 0.0452, + "grad_norm": 0.4756958760047127, + "learning_rate": 0.003, + "loss": 4.1328, + "step": 4520 + }, + { + "epoch": 0.04521, + "grad_norm": 0.49681842219333233, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 4521 + }, + { + "epoch": 0.04522, + "grad_norm": 0.5177557892182135, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 4522 + }, + { + "epoch": 0.04523, + "grad_norm": 0.4834475000365292, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 4523 + }, + { + "epoch": 0.04524, + "grad_norm": 0.4294821611706401, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 4524 + }, + { + "epoch": 0.04525, + "grad_norm": 0.4784372227541117, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 4525 + }, + { + "epoch": 0.04526, + "grad_norm": 0.5155297039314984, + "learning_rate": 0.003, + "loss": 4.1398, + "step": 4526 + }, + { + "epoch": 0.04527, + "grad_norm": 0.6810222903041584, + "learning_rate": 0.003, + "loss": 4.1255, + "step": 4527 + }, + { + "epoch": 0.04528, + "grad_norm": 0.8822750025167144, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 4528 + }, + { + "epoch": 0.04529, + "grad_norm": 0.8919184923159118, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 4529 + }, + { + "epoch": 0.0453, + "grad_norm": 0.8374148469158906, + "learning_rate": 0.003, + "loss": 4.1265, + "step": 4530 + }, + { + "epoch": 0.04531, + "grad_norm": 0.7926240119332635, + "learning_rate": 0.003, + "loss": 4.1315, + "step": 4531 + }, + { + "epoch": 0.04532, + "grad_norm": 0.8568328376834813, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 4532 + }, + { + "epoch": 0.04533, + "grad_norm": 1.0159060122619772, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 4533 + }, + { + "epoch": 0.04534, + "grad_norm": 0.9536103887238125, + "learning_rate": 0.003, + "loss": 4.1827, + "step": 4534 + }, + { + "epoch": 0.04535, + "grad_norm": 0.9201117164869018, + "learning_rate": 0.003, + "loss": 4.2131, + "step": 4535 + }, + { + "epoch": 0.04536, + "grad_norm": 1.0425882626135783, + "learning_rate": 0.003, + "loss": 4.2164, + "step": 4536 + }, + { + "epoch": 0.04537, + "grad_norm": 1.1171942484625808, + "learning_rate": 0.003, + "loss": 4.2062, + "step": 4537 + }, + { + "epoch": 0.04538, + "grad_norm": 1.095605747573551, + "learning_rate": 0.003, + "loss": 4.2133, + "step": 4538 + }, + { + "epoch": 0.04539, + "grad_norm": 0.9260638010898388, + "learning_rate": 0.003, + "loss": 4.1705, + "step": 4539 + }, + { + "epoch": 0.0454, + "grad_norm": 0.7768806609844044, + "learning_rate": 0.003, + "loss": 4.175, + "step": 4540 + }, + { + "epoch": 0.04541, + "grad_norm": 0.819877783723442, + "learning_rate": 0.003, + "loss": 4.2224, + "step": 4541 + }, + { + "epoch": 0.04542, + "grad_norm": 0.8805829563806863, + "learning_rate": 0.003, + "loss": 4.1752, + "step": 4542 + }, + { + "epoch": 0.04543, + "grad_norm": 0.7923901917703942, + "learning_rate": 0.003, + "loss": 4.207, + "step": 4543 + }, + { + "epoch": 0.04544, + "grad_norm": 0.5941044825763567, + "learning_rate": 0.003, + "loss": 4.1737, + "step": 4544 + }, + { + "epoch": 0.04545, + "grad_norm": 0.6408411687252461, + "learning_rate": 0.003, + "loss": 4.1822, + "step": 4545 + }, + { + "epoch": 0.04546, + "grad_norm": 0.8237308204937738, + "learning_rate": 0.003, + "loss": 4.1724, + "step": 4546 + }, + { + "epoch": 0.04547, + "grad_norm": 0.9235544857511309, + "learning_rate": 0.003, + "loss": 4.1822, + "step": 4547 + }, + { + "epoch": 0.04548, + "grad_norm": 0.7235487434091521, + "learning_rate": 0.003, + "loss": 4.1901, + "step": 4548 + }, + { + "epoch": 0.04549, + "grad_norm": 0.6422371993419722, + "learning_rate": 0.003, + "loss": 4.1732, + "step": 4549 + }, + { + "epoch": 0.0455, + "grad_norm": 0.5600832762072884, + "learning_rate": 0.003, + "loss": 4.1988, + "step": 4550 + }, + { + "epoch": 0.04551, + "grad_norm": 0.5228348539847921, + "learning_rate": 0.003, + "loss": 4.206, + "step": 4551 + }, + { + "epoch": 0.04552, + "grad_norm": 0.4730508563524038, + "learning_rate": 0.003, + "loss": 4.1829, + "step": 4552 + }, + { + "epoch": 0.04553, + "grad_norm": 0.5620543138219484, + "learning_rate": 0.003, + "loss": 4.1698, + "step": 4553 + }, + { + "epoch": 0.04554, + "grad_norm": 0.5765966592222326, + "learning_rate": 0.003, + "loss": 4.1553, + "step": 4554 + }, + { + "epoch": 0.04555, + "grad_norm": 0.577781369783007, + "learning_rate": 0.003, + "loss": 4.1637, + "step": 4555 + }, + { + "epoch": 0.04556, + "grad_norm": 0.5035922467137508, + "learning_rate": 0.003, + "loss": 4.1622, + "step": 4556 + }, + { + "epoch": 0.04557, + "grad_norm": 0.5559102933120925, + "learning_rate": 0.003, + "loss": 4.1442, + "step": 4557 + }, + { + "epoch": 0.04558, + "grad_norm": 0.7462315222900177, + "learning_rate": 0.003, + "loss": 4.1815, + "step": 4558 + }, + { + "epoch": 0.04559, + "grad_norm": 0.7835969334333466, + "learning_rate": 0.003, + "loss": 4.1966, + "step": 4559 + }, + { + "epoch": 0.0456, + "grad_norm": 0.6591837955511258, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 4560 + }, + { + "epoch": 0.04561, + "grad_norm": 0.7187360930779703, + "learning_rate": 0.003, + "loss": 4.1604, + "step": 4561 + }, + { + "epoch": 0.04562, + "grad_norm": 0.7184968771307666, + "learning_rate": 0.003, + "loss": 4.1834, + "step": 4562 + }, + { + "epoch": 0.04563, + "grad_norm": 0.7171157380489706, + "learning_rate": 0.003, + "loss": 4.1497, + "step": 4563 + }, + { + "epoch": 0.04564, + "grad_norm": 0.6631256505733477, + "learning_rate": 0.003, + "loss": 4.1605, + "step": 4564 + }, + { + "epoch": 0.04565, + "grad_norm": 0.5733198686410141, + "learning_rate": 0.003, + "loss": 4.128, + "step": 4565 + }, + { + "epoch": 0.04566, + "grad_norm": 0.4813774167549479, + "learning_rate": 0.003, + "loss": 4.1555, + "step": 4566 + }, + { + "epoch": 0.04567, + "grad_norm": 0.40853415838936963, + "learning_rate": 0.003, + "loss": 4.1494, + "step": 4567 + }, + { + "epoch": 0.04568, + "grad_norm": 0.37364202822845655, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 4568 + }, + { + "epoch": 0.04569, + "grad_norm": 0.3618276444120114, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 4569 + }, + { + "epoch": 0.0457, + "grad_norm": 0.3609968139572239, + "learning_rate": 0.003, + "loss": 4.1362, + "step": 4570 + }, + { + "epoch": 0.04571, + "grad_norm": 0.37747195577489445, + "learning_rate": 0.003, + "loss": 4.1615, + "step": 4571 + }, + { + "epoch": 0.04572, + "grad_norm": 0.3874865960286465, + "learning_rate": 0.003, + "loss": 4.1339, + "step": 4572 + }, + { + "epoch": 0.04573, + "grad_norm": 0.4109072807516651, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 4573 + }, + { + "epoch": 0.04574, + "grad_norm": 0.39179890606589207, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 4574 + }, + { + "epoch": 0.04575, + "grad_norm": 0.4386907207682461, + "learning_rate": 0.003, + "loss": 4.1446, + "step": 4575 + }, + { + "epoch": 0.04576, + "grad_norm": 0.5065967670777235, + "learning_rate": 0.003, + "loss": 4.1355, + "step": 4576 + }, + { + "epoch": 0.04577, + "grad_norm": 0.6143902378642073, + "learning_rate": 0.003, + "loss": 4.1597, + "step": 4577 + }, + { + "epoch": 0.04578, + "grad_norm": 0.6827140793497046, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 4578 + }, + { + "epoch": 0.04579, + "grad_norm": 0.6929677517815375, + "learning_rate": 0.003, + "loss": 4.1119, + "step": 4579 + }, + { + "epoch": 0.0458, + "grad_norm": 0.7109361429436778, + "learning_rate": 0.003, + "loss": 4.15, + "step": 4580 + }, + { + "epoch": 0.04581, + "grad_norm": 0.5770733016902231, + "learning_rate": 0.003, + "loss": 4.1591, + "step": 4581 + }, + { + "epoch": 0.04582, + "grad_norm": 0.4702915512162915, + "learning_rate": 0.003, + "loss": 4.1561, + "step": 4582 + }, + { + "epoch": 0.04583, + "grad_norm": 0.45277658262029463, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 4583 + }, + { + "epoch": 0.04584, + "grad_norm": 0.4519208543100193, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 4584 + }, + { + "epoch": 0.04585, + "grad_norm": 0.4665143329624495, + "learning_rate": 0.003, + "loss": 4.1282, + "step": 4585 + }, + { + "epoch": 0.04586, + "grad_norm": 0.5475838831331661, + "learning_rate": 0.003, + "loss": 4.1339, + "step": 4586 + }, + { + "epoch": 0.04587, + "grad_norm": 0.6898223965554029, + "learning_rate": 0.003, + "loss": 4.1575, + "step": 4587 + }, + { + "epoch": 0.04588, + "grad_norm": 0.7350712744563861, + "learning_rate": 0.003, + "loss": 4.1591, + "step": 4588 + }, + { + "epoch": 0.04589, + "grad_norm": 0.6924903850485442, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 4589 + }, + { + "epoch": 0.0459, + "grad_norm": 0.6678707302029508, + "learning_rate": 0.003, + "loss": 4.1476, + "step": 4590 + }, + { + "epoch": 0.04591, + "grad_norm": 0.6499473571050832, + "learning_rate": 0.003, + "loss": 4.117, + "step": 4591 + }, + { + "epoch": 0.04592, + "grad_norm": 0.6817526881561883, + "learning_rate": 0.003, + "loss": 4.1425, + "step": 4592 + }, + { + "epoch": 0.04593, + "grad_norm": 0.6518777120562186, + "learning_rate": 0.003, + "loss": 4.1644, + "step": 4593 + }, + { + "epoch": 0.04594, + "grad_norm": 0.5747202505666615, + "learning_rate": 0.003, + "loss": 4.1544, + "step": 4594 + }, + { + "epoch": 0.04595, + "grad_norm": 0.5397845618052467, + "learning_rate": 0.003, + "loss": 4.1411, + "step": 4595 + }, + { + "epoch": 0.04596, + "grad_norm": 0.510854997128848, + "learning_rate": 0.003, + "loss": 4.1419, + "step": 4596 + }, + { + "epoch": 0.04597, + "grad_norm": 0.49627082518670596, + "learning_rate": 0.003, + "loss": 4.147, + "step": 4597 + }, + { + "epoch": 0.04598, + "grad_norm": 0.530586170374032, + "learning_rate": 0.003, + "loss": 4.1793, + "step": 4598 + }, + { + "epoch": 0.04599, + "grad_norm": 0.48072108059262636, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 4599 + }, + { + "epoch": 0.046, + "grad_norm": 0.4162968401651856, + "learning_rate": 0.003, + "loss": 4.1362, + "step": 4600 + }, + { + "epoch": 0.04601, + "grad_norm": 0.4060240320878838, + "learning_rate": 0.003, + "loss": 4.1383, + "step": 4601 + }, + { + "epoch": 0.04602, + "grad_norm": 0.4911601212276765, + "learning_rate": 0.003, + "loss": 4.1333, + "step": 4602 + }, + { + "epoch": 0.04603, + "grad_norm": 0.5112302821705641, + "learning_rate": 0.003, + "loss": 4.1456, + "step": 4603 + }, + { + "epoch": 0.04604, + "grad_norm": 0.5642182423159343, + "learning_rate": 0.003, + "loss": 4.146, + "step": 4604 + }, + { + "epoch": 0.04605, + "grad_norm": 0.6193357979940537, + "learning_rate": 0.003, + "loss": 4.1581, + "step": 4605 + }, + { + "epoch": 0.04606, + "grad_norm": 0.7047357474337432, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 4606 + }, + { + "epoch": 0.04607, + "grad_norm": 0.782876195240902, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 4607 + }, + { + "epoch": 0.04608, + "grad_norm": 0.6985321303741415, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 4608 + }, + { + "epoch": 0.04609, + "grad_norm": 0.49949454925317704, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 4609 + }, + { + "epoch": 0.0461, + "grad_norm": 0.4635494280797984, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 4610 + }, + { + "epoch": 0.04611, + "grad_norm": 0.4523573678413289, + "learning_rate": 0.003, + "loss": 4.1611, + "step": 4611 + }, + { + "epoch": 0.04612, + "grad_norm": 0.5042320328946598, + "learning_rate": 0.003, + "loss": 4.143, + "step": 4612 + }, + { + "epoch": 0.04613, + "grad_norm": 0.5210913810336651, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 4613 + }, + { + "epoch": 0.04614, + "grad_norm": 0.6245444112274343, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 4614 + }, + { + "epoch": 0.04615, + "grad_norm": 0.6535761170229347, + "learning_rate": 0.003, + "loss": 4.138, + "step": 4615 + }, + { + "epoch": 0.04616, + "grad_norm": 0.6420802226733096, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 4616 + }, + { + "epoch": 0.04617, + "grad_norm": 0.5687953058470812, + "learning_rate": 0.003, + "loss": 4.1442, + "step": 4617 + }, + { + "epoch": 0.04618, + "grad_norm": 0.5083532285672069, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 4618 + }, + { + "epoch": 0.04619, + "grad_norm": 0.5265907169405639, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 4619 + }, + { + "epoch": 0.0462, + "grad_norm": 0.5890366123582629, + "learning_rate": 0.003, + "loss": 4.155, + "step": 4620 + }, + { + "epoch": 0.04621, + "grad_norm": 0.6504323601577833, + "learning_rate": 0.003, + "loss": 4.165, + "step": 4621 + }, + { + "epoch": 0.04622, + "grad_norm": 0.6707598130254062, + "learning_rate": 0.003, + "loss": 4.1612, + "step": 4622 + }, + { + "epoch": 0.04623, + "grad_norm": 0.6302847440276245, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 4623 + }, + { + "epoch": 0.04624, + "grad_norm": 0.6466984839849552, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 4624 + }, + { + "epoch": 0.04625, + "grad_norm": 0.6909336873679446, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 4625 + }, + { + "epoch": 0.04626, + "grad_norm": 0.6100952148809866, + "learning_rate": 0.003, + "loss": 4.146, + "step": 4626 + }, + { + "epoch": 0.04627, + "grad_norm": 0.54274307554158, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 4627 + }, + { + "epoch": 0.04628, + "grad_norm": 0.5405464195031551, + "learning_rate": 0.003, + "loss": 4.1564, + "step": 4628 + }, + { + "epoch": 0.04629, + "grad_norm": 0.5457077621265701, + "learning_rate": 0.003, + "loss": 4.1313, + "step": 4629 + }, + { + "epoch": 0.0463, + "grad_norm": 0.5985608016616293, + "learning_rate": 0.003, + "loss": 4.1464, + "step": 4630 + }, + { + "epoch": 0.04631, + "grad_norm": 0.6821487067722284, + "learning_rate": 0.003, + "loss": 4.1345, + "step": 4631 + }, + { + "epoch": 0.04632, + "grad_norm": 0.7372265744416058, + "learning_rate": 0.003, + "loss": 4.1302, + "step": 4632 + }, + { + "epoch": 0.04633, + "grad_norm": 0.8415550301042956, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 4633 + }, + { + "epoch": 0.04634, + "grad_norm": 0.8373989629700342, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 4634 + }, + { + "epoch": 0.04635, + "grad_norm": 0.7581438335563392, + "learning_rate": 0.003, + "loss": 4.1486, + "step": 4635 + }, + { + "epoch": 0.04636, + "grad_norm": 0.7125351605332876, + "learning_rate": 0.003, + "loss": 4.183, + "step": 4636 + }, + { + "epoch": 0.04637, + "grad_norm": 0.7232757442184159, + "learning_rate": 0.003, + "loss": 4.1282, + "step": 4637 + }, + { + "epoch": 0.04638, + "grad_norm": 0.6757057573333891, + "learning_rate": 0.003, + "loss": 4.1632, + "step": 4638 + }, + { + "epoch": 0.04639, + "grad_norm": 0.719912413703624, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 4639 + }, + { + "epoch": 0.0464, + "grad_norm": 0.7428965743218653, + "learning_rate": 0.003, + "loss": 4.1606, + "step": 4640 + }, + { + "epoch": 0.04641, + "grad_norm": 0.8598211728393256, + "learning_rate": 0.003, + "loss": 4.1798, + "step": 4641 + }, + { + "epoch": 0.04642, + "grad_norm": 0.8907386217362441, + "learning_rate": 0.003, + "loss": 4.1302, + "step": 4642 + }, + { + "epoch": 0.04643, + "grad_norm": 0.7795644135221422, + "learning_rate": 0.003, + "loss": 4.152, + "step": 4643 + }, + { + "epoch": 0.04644, + "grad_norm": 0.5620211951242756, + "learning_rate": 0.003, + "loss": 4.1487, + "step": 4644 + }, + { + "epoch": 0.04645, + "grad_norm": 0.5137991034790863, + "learning_rate": 0.003, + "loss": 4.1401, + "step": 4645 + }, + { + "epoch": 0.04646, + "grad_norm": 0.5506672519917369, + "learning_rate": 0.003, + "loss": 4.1418, + "step": 4646 + }, + { + "epoch": 0.04647, + "grad_norm": 0.545899380374656, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 4647 + }, + { + "epoch": 0.04648, + "grad_norm": 0.5926193733697508, + "learning_rate": 0.003, + "loss": 4.1583, + "step": 4648 + }, + { + "epoch": 0.04649, + "grad_norm": 0.6418478720978149, + "learning_rate": 0.003, + "loss": 4.1419, + "step": 4649 + }, + { + "epoch": 0.0465, + "grad_norm": 0.6971029121589252, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 4650 + }, + { + "epoch": 0.04651, + "grad_norm": 0.7128225140702158, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 4651 + }, + { + "epoch": 0.04652, + "grad_norm": 0.6817984613215538, + "learning_rate": 0.003, + "loss": 4.1556, + "step": 4652 + }, + { + "epoch": 0.04653, + "grad_norm": 0.6586749661533192, + "learning_rate": 0.003, + "loss": 4.157, + "step": 4653 + }, + { + "epoch": 0.04654, + "grad_norm": 0.5413982579807713, + "learning_rate": 0.003, + "loss": 4.1451, + "step": 4654 + }, + { + "epoch": 0.04655, + "grad_norm": 0.4819381736457758, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 4655 + }, + { + "epoch": 0.04656, + "grad_norm": 0.5090740776020435, + "learning_rate": 0.003, + "loss": 4.12, + "step": 4656 + }, + { + "epoch": 0.04657, + "grad_norm": 0.4529973940302403, + "learning_rate": 0.003, + "loss": 4.1551, + "step": 4657 + }, + { + "epoch": 0.04658, + "grad_norm": 0.3863158282687969, + "learning_rate": 0.003, + "loss": 4.1262, + "step": 4658 + }, + { + "epoch": 0.04659, + "grad_norm": 0.3846174175293322, + "learning_rate": 0.003, + "loss": 4.1262, + "step": 4659 + }, + { + "epoch": 0.0466, + "grad_norm": 0.33629970185853586, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 4660 + }, + { + "epoch": 0.04661, + "grad_norm": 0.3733824492556284, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 4661 + }, + { + "epoch": 0.04662, + "grad_norm": 0.40706146799826654, + "learning_rate": 0.003, + "loss": 4.1152, + "step": 4662 + }, + { + "epoch": 0.04663, + "grad_norm": 0.4624049284667894, + "learning_rate": 0.003, + "loss": 4.1355, + "step": 4663 + }, + { + "epoch": 0.04664, + "grad_norm": 0.6204385926386629, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 4664 + }, + { + "epoch": 0.04665, + "grad_norm": 0.8486297396587145, + "learning_rate": 0.003, + "loss": 4.1261, + "step": 4665 + }, + { + "epoch": 0.04666, + "grad_norm": 0.8396332728949535, + "learning_rate": 0.003, + "loss": 4.128, + "step": 4666 + }, + { + "epoch": 0.04667, + "grad_norm": 0.604029857762682, + "learning_rate": 0.003, + "loss": 4.1204, + "step": 4667 + }, + { + "epoch": 0.04668, + "grad_norm": 0.6187443793003241, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 4668 + }, + { + "epoch": 0.04669, + "grad_norm": 0.671068656091709, + "learning_rate": 0.003, + "loss": 4.1551, + "step": 4669 + }, + { + "epoch": 0.0467, + "grad_norm": 0.582982047763638, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 4670 + }, + { + "epoch": 0.04671, + "grad_norm": 0.5791042512401672, + "learning_rate": 0.003, + "loss": 4.128, + "step": 4671 + }, + { + "epoch": 0.04672, + "grad_norm": 0.5822833904953063, + "learning_rate": 0.003, + "loss": 4.1419, + "step": 4672 + }, + { + "epoch": 0.04673, + "grad_norm": 0.6690185777600851, + "learning_rate": 0.003, + "loss": 4.1511, + "step": 4673 + }, + { + "epoch": 0.04674, + "grad_norm": 0.6501772820492476, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 4674 + }, + { + "epoch": 0.04675, + "grad_norm": 0.5619140983596763, + "learning_rate": 0.003, + "loss": 4.143, + "step": 4675 + }, + { + "epoch": 0.04676, + "grad_norm": 0.4844274130103253, + "learning_rate": 0.003, + "loss": 4.145, + "step": 4676 + }, + { + "epoch": 0.04677, + "grad_norm": 0.46020296554025397, + "learning_rate": 0.003, + "loss": 4.1477, + "step": 4677 + }, + { + "epoch": 0.04678, + "grad_norm": 0.4560378554438905, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 4678 + }, + { + "epoch": 0.04679, + "grad_norm": 0.48237840297163587, + "learning_rate": 0.003, + "loss": 4.1347, + "step": 4679 + }, + { + "epoch": 0.0468, + "grad_norm": 0.545076677673247, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 4680 + }, + { + "epoch": 0.04681, + "grad_norm": 0.6224973549149077, + "learning_rate": 0.003, + "loss": 4.1596, + "step": 4681 + }, + { + "epoch": 0.04682, + "grad_norm": 0.7719478560198243, + "learning_rate": 0.003, + "loss": 4.1344, + "step": 4682 + }, + { + "epoch": 0.04683, + "grad_norm": 0.8064115060114956, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 4683 + }, + { + "epoch": 0.04684, + "grad_norm": 0.8855175249633898, + "learning_rate": 0.003, + "loss": 4.1577, + "step": 4684 + }, + { + "epoch": 0.04685, + "grad_norm": 0.8534062578922067, + "learning_rate": 0.003, + "loss": 4.145, + "step": 4685 + }, + { + "epoch": 0.04686, + "grad_norm": 0.797573707115956, + "learning_rate": 0.003, + "loss": 4.1545, + "step": 4686 + }, + { + "epoch": 0.04687, + "grad_norm": 0.713270606155539, + "learning_rate": 0.003, + "loss": 4.181, + "step": 4687 + }, + { + "epoch": 0.04688, + "grad_norm": 0.6241645872789137, + "learning_rate": 0.003, + "loss": 4.1564, + "step": 4688 + }, + { + "epoch": 0.04689, + "grad_norm": 0.559937415479369, + "learning_rate": 0.003, + "loss": 4.1712, + "step": 4689 + }, + { + "epoch": 0.0469, + "grad_norm": 0.5404635622147993, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 4690 + }, + { + "epoch": 0.04691, + "grad_norm": 0.486653564258482, + "learning_rate": 0.003, + "loss": 4.1318, + "step": 4691 + }, + { + "epoch": 0.04692, + "grad_norm": 0.481468369732997, + "learning_rate": 0.003, + "loss": 4.1351, + "step": 4692 + }, + { + "epoch": 0.04693, + "grad_norm": 0.5011365510399343, + "learning_rate": 0.003, + "loss": 4.1601, + "step": 4693 + }, + { + "epoch": 0.04694, + "grad_norm": 0.48431784153510343, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 4694 + }, + { + "epoch": 0.04695, + "grad_norm": 0.5590735035376782, + "learning_rate": 0.003, + "loss": 4.1121, + "step": 4695 + }, + { + "epoch": 0.04696, + "grad_norm": 0.5991808838927974, + "learning_rate": 0.003, + "loss": 4.1565, + "step": 4696 + }, + { + "epoch": 0.04697, + "grad_norm": 0.6802689730153363, + "learning_rate": 0.003, + "loss": 4.1429, + "step": 4697 + }, + { + "epoch": 0.04698, + "grad_norm": 0.6842952307392385, + "learning_rate": 0.003, + "loss": 4.1463, + "step": 4698 + }, + { + "epoch": 0.04699, + "grad_norm": 0.6034505759619246, + "learning_rate": 0.003, + "loss": 4.1321, + "step": 4699 + }, + { + "epoch": 0.047, + "grad_norm": 0.5907548018053599, + "learning_rate": 0.003, + "loss": 4.1459, + "step": 4700 + }, + { + "epoch": 0.04701, + "grad_norm": 0.5241329554797072, + "learning_rate": 0.003, + "loss": 4.1541, + "step": 4701 + }, + { + "epoch": 0.04702, + "grad_norm": 0.5399206512572382, + "learning_rate": 0.003, + "loss": 4.1446, + "step": 4702 + }, + { + "epoch": 0.04703, + "grad_norm": 0.5629949677153792, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 4703 + }, + { + "epoch": 0.04704, + "grad_norm": 0.4982163977250413, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 4704 + }, + { + "epoch": 0.04705, + "grad_norm": 0.5165399833831362, + "learning_rate": 0.003, + "loss": 4.1559, + "step": 4705 + }, + { + "epoch": 0.04706, + "grad_norm": 0.5588859123289356, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 4706 + }, + { + "epoch": 0.04707, + "grad_norm": 0.6493712508280153, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 4707 + }, + { + "epoch": 0.04708, + "grad_norm": 0.7592099124000022, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 4708 + }, + { + "epoch": 0.04709, + "grad_norm": 0.7643143840544172, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 4709 + }, + { + "epoch": 0.0471, + "grad_norm": 0.7117650865961848, + "learning_rate": 0.003, + "loss": 4.1496, + "step": 4710 + }, + { + "epoch": 0.04711, + "grad_norm": 0.6222639694626878, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 4711 + }, + { + "epoch": 0.04712, + "grad_norm": 0.5676066371017507, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 4712 + }, + { + "epoch": 0.04713, + "grad_norm": 0.5770424755113991, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 4713 + }, + { + "epoch": 0.04714, + "grad_norm": 0.5617675778124301, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 4714 + }, + { + "epoch": 0.04715, + "grad_norm": 0.5635879231709099, + "learning_rate": 0.003, + "loss": 4.1583, + "step": 4715 + }, + { + "epoch": 0.04716, + "grad_norm": 0.5756399495553071, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 4716 + }, + { + "epoch": 0.04717, + "grad_norm": 0.5516676091531315, + "learning_rate": 0.003, + "loss": 4.146, + "step": 4717 + }, + { + "epoch": 0.04718, + "grad_norm": 0.5287651595406879, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 4718 + }, + { + "epoch": 0.04719, + "grad_norm": 0.5253847377704106, + "learning_rate": 0.003, + "loss": 4.1173, + "step": 4719 + }, + { + "epoch": 0.0472, + "grad_norm": 0.5350874385832446, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 4720 + }, + { + "epoch": 0.04721, + "grad_norm": 0.6004531613918229, + "learning_rate": 0.003, + "loss": 4.1115, + "step": 4721 + }, + { + "epoch": 0.04722, + "grad_norm": 0.7164746962025985, + "learning_rate": 0.003, + "loss": 4.1357, + "step": 4722 + }, + { + "epoch": 0.04723, + "grad_norm": 0.7305093269163504, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 4723 + }, + { + "epoch": 0.04724, + "grad_norm": 0.6386395939306334, + "learning_rate": 0.003, + "loss": 4.1164, + "step": 4724 + }, + { + "epoch": 0.04725, + "grad_norm": 0.5477055183445729, + "learning_rate": 0.003, + "loss": 4.1373, + "step": 4725 + }, + { + "epoch": 0.04726, + "grad_norm": 0.6698851087894558, + "learning_rate": 0.003, + "loss": 4.1328, + "step": 4726 + }, + { + "epoch": 0.04727, + "grad_norm": 0.6952010909817607, + "learning_rate": 0.003, + "loss": 4.121, + "step": 4727 + }, + { + "epoch": 0.04728, + "grad_norm": 0.6237561354472855, + "learning_rate": 0.003, + "loss": 4.1242, + "step": 4728 + }, + { + "epoch": 0.04729, + "grad_norm": 0.5824726583084653, + "learning_rate": 0.003, + "loss": 4.1416, + "step": 4729 + }, + { + "epoch": 0.0473, + "grad_norm": 0.6234484457938285, + "learning_rate": 0.003, + "loss": 4.1396, + "step": 4730 + }, + { + "epoch": 0.04731, + "grad_norm": 0.7040984462948884, + "learning_rate": 0.003, + "loss": 4.1364, + "step": 4731 + }, + { + "epoch": 0.04732, + "grad_norm": 0.8060281611564052, + "learning_rate": 0.003, + "loss": 4.1555, + "step": 4732 + }, + { + "epoch": 0.04733, + "grad_norm": 0.8908771747222946, + "learning_rate": 0.003, + "loss": 4.1476, + "step": 4733 + }, + { + "epoch": 0.04734, + "grad_norm": 0.8915755383205525, + "learning_rate": 0.003, + "loss": 4.1618, + "step": 4734 + }, + { + "epoch": 0.04735, + "grad_norm": 0.8597675990151213, + "learning_rate": 0.003, + "loss": 4.1801, + "step": 4735 + }, + { + "epoch": 0.04736, + "grad_norm": 0.9421924904802637, + "learning_rate": 0.003, + "loss": 4.1714, + "step": 4736 + }, + { + "epoch": 0.04737, + "grad_norm": 0.9723514275784639, + "learning_rate": 0.003, + "loss": 4.1406, + "step": 4737 + }, + { + "epoch": 0.04738, + "grad_norm": 0.8765529889726066, + "learning_rate": 0.003, + "loss": 4.1853, + "step": 4738 + }, + { + "epoch": 0.04739, + "grad_norm": 0.6891707628164019, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 4739 + }, + { + "epoch": 0.0474, + "grad_norm": 0.6320172894482899, + "learning_rate": 0.003, + "loss": 4.1783, + "step": 4740 + }, + { + "epoch": 0.04741, + "grad_norm": 0.6185420901083515, + "learning_rate": 0.003, + "loss": 4.1667, + "step": 4741 + }, + { + "epoch": 0.04742, + "grad_norm": 0.5743911871453854, + "learning_rate": 0.003, + "loss": 4.1474, + "step": 4742 + }, + { + "epoch": 0.04743, + "grad_norm": 0.5220289188801445, + "learning_rate": 0.003, + "loss": 4.1652, + "step": 4743 + }, + { + "epoch": 0.04744, + "grad_norm": 0.509105103411712, + "learning_rate": 0.003, + "loss": 4.1261, + "step": 4744 + }, + { + "epoch": 0.04745, + "grad_norm": 0.47978510769114535, + "learning_rate": 0.003, + "loss": 4.1279, + "step": 4745 + }, + { + "epoch": 0.04746, + "grad_norm": 0.5254663892408653, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 4746 + }, + { + "epoch": 0.04747, + "grad_norm": 0.5939317340288737, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 4747 + }, + { + "epoch": 0.04748, + "grad_norm": 0.5634019114968404, + "learning_rate": 0.003, + "loss": 4.1662, + "step": 4748 + }, + { + "epoch": 0.04749, + "grad_norm": 0.5497840563348653, + "learning_rate": 0.003, + "loss": 4.1719, + "step": 4749 + }, + { + "epoch": 0.0475, + "grad_norm": 0.5714725339017903, + "learning_rate": 0.003, + "loss": 4.1375, + "step": 4750 + }, + { + "epoch": 0.04751, + "grad_norm": 0.6084741881095581, + "learning_rate": 0.003, + "loss": 4.103, + "step": 4751 + }, + { + "epoch": 0.04752, + "grad_norm": 0.6608722355986122, + "learning_rate": 0.003, + "loss": 4.1375, + "step": 4752 + }, + { + "epoch": 0.04753, + "grad_norm": 0.7109251785941167, + "learning_rate": 0.003, + "loss": 4.1449, + "step": 4753 + }, + { + "epoch": 0.04754, + "grad_norm": 0.6628914079263116, + "learning_rate": 0.003, + "loss": 4.1659, + "step": 4754 + }, + { + "epoch": 0.04755, + "grad_norm": 0.5644082779422211, + "learning_rate": 0.003, + "loss": 4.139, + "step": 4755 + }, + { + "epoch": 0.04756, + "grad_norm": 0.5287140638812969, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 4756 + }, + { + "epoch": 0.04757, + "grad_norm": 0.49259530652550815, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 4757 + }, + { + "epoch": 0.04758, + "grad_norm": 0.5614284037719612, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 4758 + }, + { + "epoch": 0.04759, + "grad_norm": 0.5959601501901447, + "learning_rate": 0.003, + "loss": 4.1404, + "step": 4759 + }, + { + "epoch": 0.0476, + "grad_norm": 0.6904403583130148, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 4760 + }, + { + "epoch": 0.04761, + "grad_norm": 0.7246423353646352, + "learning_rate": 0.003, + "loss": 4.1311, + "step": 4761 + }, + { + "epoch": 0.04762, + "grad_norm": 0.6915319453958364, + "learning_rate": 0.003, + "loss": 4.1299, + "step": 4762 + }, + { + "epoch": 0.04763, + "grad_norm": 0.6256527340482957, + "learning_rate": 0.003, + "loss": 4.1269, + "step": 4763 + }, + { + "epoch": 0.04764, + "grad_norm": 0.6439794931734228, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 4764 + }, + { + "epoch": 0.04765, + "grad_norm": 0.6077799243331904, + "learning_rate": 0.003, + "loss": 4.1654, + "step": 4765 + }, + { + "epoch": 0.04766, + "grad_norm": 0.5987824824691682, + "learning_rate": 0.003, + "loss": 4.1407, + "step": 4766 + }, + { + "epoch": 0.04767, + "grad_norm": 0.5323630132045762, + "learning_rate": 0.003, + "loss": 4.1343, + "step": 4767 + }, + { + "epoch": 0.04768, + "grad_norm": 0.49097893919403107, + "learning_rate": 0.003, + "loss": 4.1268, + "step": 4768 + }, + { + "epoch": 0.04769, + "grad_norm": 0.41538834375199724, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 4769 + }, + { + "epoch": 0.0477, + "grad_norm": 0.4062286279109976, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 4770 + }, + { + "epoch": 0.04771, + "grad_norm": 0.3969385937952626, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 4771 + }, + { + "epoch": 0.04772, + "grad_norm": 0.47881496679265567, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 4772 + }, + { + "epoch": 0.04773, + "grad_norm": 0.5857337921957999, + "learning_rate": 0.003, + "loss": 4.1524, + "step": 4773 + }, + { + "epoch": 0.04774, + "grad_norm": 0.6545046565097566, + "learning_rate": 0.003, + "loss": 4.1399, + "step": 4774 + }, + { + "epoch": 0.04775, + "grad_norm": 0.6909582502342425, + "learning_rate": 0.003, + "loss": 4.1388, + "step": 4775 + }, + { + "epoch": 0.04776, + "grad_norm": 0.6830425397433746, + "learning_rate": 0.003, + "loss": 4.1266, + "step": 4776 + }, + { + "epoch": 0.04777, + "grad_norm": 0.5407062720120711, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 4777 + }, + { + "epoch": 0.04778, + "grad_norm": 0.4928003344996406, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 4778 + }, + { + "epoch": 0.04779, + "grad_norm": 0.5540563562005282, + "learning_rate": 0.003, + "loss": 4.137, + "step": 4779 + }, + { + "epoch": 0.0478, + "grad_norm": 0.5919870365085376, + "learning_rate": 0.003, + "loss": 4.141, + "step": 4780 + }, + { + "epoch": 0.04781, + "grad_norm": 0.6502125533927232, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 4781 + }, + { + "epoch": 0.04782, + "grad_norm": 0.5583168452423385, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 4782 + }, + { + "epoch": 0.04783, + "grad_norm": 0.49694295687820944, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 4783 + }, + { + "epoch": 0.04784, + "grad_norm": 0.4979847074086822, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 4784 + }, + { + "epoch": 0.04785, + "grad_norm": 0.5392917734790091, + "learning_rate": 0.003, + "loss": 4.1386, + "step": 4785 + }, + { + "epoch": 0.04786, + "grad_norm": 0.5426680689275899, + "learning_rate": 0.003, + "loss": 4.1409, + "step": 4786 + }, + { + "epoch": 0.04787, + "grad_norm": 0.5518443356995324, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 4787 + }, + { + "epoch": 0.04788, + "grad_norm": 0.6334713170503672, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 4788 + }, + { + "epoch": 0.04789, + "grad_norm": 0.6443157992564054, + "learning_rate": 0.003, + "loss": 4.109, + "step": 4789 + }, + { + "epoch": 0.0479, + "grad_norm": 0.6616151594742191, + "learning_rate": 0.003, + "loss": 4.127, + "step": 4790 + }, + { + "epoch": 0.04791, + "grad_norm": 0.7664115436248651, + "learning_rate": 0.003, + "loss": 4.1518, + "step": 4791 + }, + { + "epoch": 0.04792, + "grad_norm": 0.8885911114335623, + "learning_rate": 0.003, + "loss": 4.1663, + "step": 4792 + }, + { + "epoch": 0.04793, + "grad_norm": 1.0730053893316884, + "learning_rate": 0.003, + "loss": 4.1418, + "step": 4793 + }, + { + "epoch": 0.04794, + "grad_norm": 0.7779447364165427, + "learning_rate": 0.003, + "loss": 4.1567, + "step": 4794 + }, + { + "epoch": 0.04795, + "grad_norm": 0.588536399495119, + "learning_rate": 0.003, + "loss": 4.147, + "step": 4795 + }, + { + "epoch": 0.04796, + "grad_norm": 0.68568663004306, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 4796 + }, + { + "epoch": 0.04797, + "grad_norm": 0.681720094933613, + "learning_rate": 0.003, + "loss": 4.1545, + "step": 4797 + }, + { + "epoch": 0.04798, + "grad_norm": 0.7679505695308819, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 4798 + }, + { + "epoch": 0.04799, + "grad_norm": 0.7317665379567332, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 4799 + }, + { + "epoch": 0.048, + "grad_norm": 0.6100825656074705, + "learning_rate": 0.003, + "loss": 4.1333, + "step": 4800 + }, + { + "epoch": 0.04801, + "grad_norm": 0.5012414047123679, + "learning_rate": 0.003, + "loss": 4.1386, + "step": 4801 + }, + { + "epoch": 0.04802, + "grad_norm": 0.4479983336578663, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 4802 + }, + { + "epoch": 0.04803, + "grad_norm": 0.4184616004474593, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 4803 + }, + { + "epoch": 0.04804, + "grad_norm": 0.41305580441153505, + "learning_rate": 0.003, + "loss": 4.147, + "step": 4804 + }, + { + "epoch": 0.04805, + "grad_norm": 0.4398073604549804, + "learning_rate": 0.003, + "loss": 4.1451, + "step": 4805 + }, + { + "epoch": 0.04806, + "grad_norm": 0.42743235225805976, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 4806 + }, + { + "epoch": 0.04807, + "grad_norm": 0.4727115258160579, + "learning_rate": 0.003, + "loss": 4.1438, + "step": 4807 + }, + { + "epoch": 0.04808, + "grad_norm": 0.5919755301104812, + "learning_rate": 0.003, + "loss": 4.126, + "step": 4808 + }, + { + "epoch": 0.04809, + "grad_norm": 0.7289693894093642, + "learning_rate": 0.003, + "loss": 4.1239, + "step": 4809 + }, + { + "epoch": 0.0481, + "grad_norm": 0.7839859175085296, + "learning_rate": 0.003, + "loss": 4.1511, + "step": 4810 + }, + { + "epoch": 0.04811, + "grad_norm": 0.6884295236179085, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 4811 + }, + { + "epoch": 0.04812, + "grad_norm": 0.7352119188470826, + "learning_rate": 0.003, + "loss": 4.1488, + "step": 4812 + }, + { + "epoch": 0.04813, + "grad_norm": 0.695166473264558, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 4813 + }, + { + "epoch": 0.04814, + "grad_norm": 0.6553813909063907, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 4814 + }, + { + "epoch": 0.04815, + "grad_norm": 0.5872101085364063, + "learning_rate": 0.003, + "loss": 4.1306, + "step": 4815 + }, + { + "epoch": 0.04816, + "grad_norm": 0.6233113378649032, + "learning_rate": 0.003, + "loss": 4.1404, + "step": 4816 + }, + { + "epoch": 0.04817, + "grad_norm": 0.6150064661388728, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 4817 + }, + { + "epoch": 0.04818, + "grad_norm": 0.5836692065444768, + "learning_rate": 0.003, + "loss": 4.1284, + "step": 4818 + }, + { + "epoch": 0.04819, + "grad_norm": 0.66525230376135, + "learning_rate": 0.003, + "loss": 4.138, + "step": 4819 + }, + { + "epoch": 0.0482, + "grad_norm": 0.734508856599843, + "learning_rate": 0.003, + "loss": 4.131, + "step": 4820 + }, + { + "epoch": 0.04821, + "grad_norm": 0.7602706927514201, + "learning_rate": 0.003, + "loss": 4.1636, + "step": 4821 + }, + { + "epoch": 0.04822, + "grad_norm": 0.7876820121443141, + "learning_rate": 0.003, + "loss": 4.1439, + "step": 4822 + }, + { + "epoch": 0.04823, + "grad_norm": 0.7863376722170511, + "learning_rate": 0.003, + "loss": 4.1376, + "step": 4823 + }, + { + "epoch": 0.04824, + "grad_norm": 0.8763060513910201, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 4824 + }, + { + "epoch": 0.04825, + "grad_norm": 0.8932088904498784, + "learning_rate": 0.003, + "loss": 4.1501, + "step": 4825 + }, + { + "epoch": 0.04826, + "grad_norm": 0.7974932345630039, + "learning_rate": 0.003, + "loss": 4.1546, + "step": 4826 + }, + { + "epoch": 0.04827, + "grad_norm": 0.787269582726668, + "learning_rate": 0.003, + "loss": 4.1569, + "step": 4827 + }, + { + "epoch": 0.04828, + "grad_norm": 0.8334258319646332, + "learning_rate": 0.003, + "loss": 4.1713, + "step": 4828 + }, + { + "epoch": 0.04829, + "grad_norm": 0.8348604948088247, + "learning_rate": 0.003, + "loss": 4.1556, + "step": 4829 + }, + { + "epoch": 0.0483, + "grad_norm": 0.8624019255671944, + "learning_rate": 0.003, + "loss": 4.1718, + "step": 4830 + }, + { + "epoch": 0.04831, + "grad_norm": 0.9982066722626229, + "learning_rate": 0.003, + "loss": 4.1513, + "step": 4831 + }, + { + "epoch": 0.04832, + "grad_norm": 0.7923904912144631, + "learning_rate": 0.003, + "loss": 4.1652, + "step": 4832 + }, + { + "epoch": 0.04833, + "grad_norm": 0.7754468674602506, + "learning_rate": 0.003, + "loss": 4.173, + "step": 4833 + }, + { + "epoch": 0.04834, + "grad_norm": 0.6518452811577893, + "learning_rate": 0.003, + "loss": 4.134, + "step": 4834 + }, + { + "epoch": 0.04835, + "grad_norm": 0.7302318791564857, + "learning_rate": 0.003, + "loss": 4.1796, + "step": 4835 + }, + { + "epoch": 0.04836, + "grad_norm": 0.6773722956545634, + "learning_rate": 0.003, + "loss": 4.146, + "step": 4836 + }, + { + "epoch": 0.04837, + "grad_norm": 0.5824847865414027, + "learning_rate": 0.003, + "loss": 4.1626, + "step": 4837 + }, + { + "epoch": 0.04838, + "grad_norm": 0.5995229116547709, + "learning_rate": 0.003, + "loss": 4.1738, + "step": 4838 + }, + { + "epoch": 0.04839, + "grad_norm": 0.6433667604247222, + "learning_rate": 0.003, + "loss": 4.1595, + "step": 4839 + }, + { + "epoch": 0.0484, + "grad_norm": 0.6015686841024611, + "learning_rate": 0.003, + "loss": 4.1606, + "step": 4840 + }, + { + "epoch": 0.04841, + "grad_norm": 0.513992549739537, + "learning_rate": 0.003, + "loss": 4.1528, + "step": 4841 + }, + { + "epoch": 0.04842, + "grad_norm": 0.5245975343951695, + "learning_rate": 0.003, + "loss": 4.168, + "step": 4842 + }, + { + "epoch": 0.04843, + "grad_norm": 0.48743719353902437, + "learning_rate": 0.003, + "loss": 4.119, + "step": 4843 + }, + { + "epoch": 0.04844, + "grad_norm": 0.4307867246160189, + "learning_rate": 0.003, + "loss": 4.1464, + "step": 4844 + }, + { + "epoch": 0.04845, + "grad_norm": 0.3751006510902314, + "learning_rate": 0.003, + "loss": 4.1298, + "step": 4845 + }, + { + "epoch": 0.04846, + "grad_norm": 0.39953129595847076, + "learning_rate": 0.003, + "loss": 4.1491, + "step": 4846 + }, + { + "epoch": 0.04847, + "grad_norm": 0.36070808802461135, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 4847 + }, + { + "epoch": 0.04848, + "grad_norm": 0.39594455872525836, + "learning_rate": 0.003, + "loss": 4.1519, + "step": 4848 + }, + { + "epoch": 0.04849, + "grad_norm": 0.38000994045173514, + "learning_rate": 0.003, + "loss": 4.1425, + "step": 4849 + }, + { + "epoch": 0.0485, + "grad_norm": 0.4205813881723009, + "learning_rate": 0.003, + "loss": 4.1319, + "step": 4850 + }, + { + "epoch": 0.04851, + "grad_norm": 0.4045018073402591, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 4851 + }, + { + "epoch": 0.04852, + "grad_norm": 0.37568412955203784, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 4852 + }, + { + "epoch": 0.04853, + "grad_norm": 0.3455978343249278, + "learning_rate": 0.003, + "loss": 4.1342, + "step": 4853 + }, + { + "epoch": 0.04854, + "grad_norm": 0.3952113128712618, + "learning_rate": 0.003, + "loss": 4.1308, + "step": 4854 + }, + { + "epoch": 0.04855, + "grad_norm": 0.5622316469812096, + "learning_rate": 0.003, + "loss": 4.1429, + "step": 4855 + }, + { + "epoch": 0.04856, + "grad_norm": 0.8797178488644849, + "learning_rate": 0.003, + "loss": 4.1514, + "step": 4856 + }, + { + "epoch": 0.04857, + "grad_norm": 1.1335123213922318, + "learning_rate": 0.003, + "loss": 4.1859, + "step": 4857 + }, + { + "epoch": 0.04858, + "grad_norm": 0.6147110414360265, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 4858 + }, + { + "epoch": 0.04859, + "grad_norm": 0.6973892953945595, + "learning_rate": 0.003, + "loss": 4.1734, + "step": 4859 + }, + { + "epoch": 0.0486, + "grad_norm": 0.7043154871004238, + "learning_rate": 0.003, + "loss": 4.1524, + "step": 4860 + }, + { + "epoch": 0.04861, + "grad_norm": 0.7327036767884083, + "learning_rate": 0.003, + "loss": 4.1412, + "step": 4861 + }, + { + "epoch": 0.04862, + "grad_norm": 0.712112403189955, + "learning_rate": 0.003, + "loss": 4.1478, + "step": 4862 + }, + { + "epoch": 0.04863, + "grad_norm": 0.7121663551182991, + "learning_rate": 0.003, + "loss": 4.1673, + "step": 4863 + }, + { + "epoch": 0.04864, + "grad_norm": 0.6310157347751136, + "learning_rate": 0.003, + "loss": 4.1417, + "step": 4864 + }, + { + "epoch": 0.04865, + "grad_norm": 0.6082126733812362, + "learning_rate": 0.003, + "loss": 4.1569, + "step": 4865 + }, + { + "epoch": 0.04866, + "grad_norm": 0.569379004286016, + "learning_rate": 0.003, + "loss": 4.1621, + "step": 4866 + }, + { + "epoch": 0.04867, + "grad_norm": 0.543390213497262, + "learning_rate": 0.003, + "loss": 4.1275, + "step": 4867 + }, + { + "epoch": 0.04868, + "grad_norm": 0.5636989030319737, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 4868 + }, + { + "epoch": 0.04869, + "grad_norm": 0.5250393396952697, + "learning_rate": 0.003, + "loss": 4.1459, + "step": 4869 + }, + { + "epoch": 0.0487, + "grad_norm": 0.4712102787061369, + "learning_rate": 0.003, + "loss": 4.1341, + "step": 4870 + }, + { + "epoch": 0.04871, + "grad_norm": 0.4137409461601053, + "learning_rate": 0.003, + "loss": 4.1494, + "step": 4871 + }, + { + "epoch": 0.04872, + "grad_norm": 0.34190339505881295, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 4872 + }, + { + "epoch": 0.04873, + "grad_norm": 0.3786358077114681, + "learning_rate": 0.003, + "loss": 4.138, + "step": 4873 + }, + { + "epoch": 0.04874, + "grad_norm": 0.3569458423955174, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 4874 + }, + { + "epoch": 0.04875, + "grad_norm": 0.34447042337429823, + "learning_rate": 0.003, + "loss": 4.119, + "step": 4875 + }, + { + "epoch": 0.04876, + "grad_norm": 0.40030916663729343, + "learning_rate": 0.003, + "loss": 4.121, + "step": 4876 + }, + { + "epoch": 0.04877, + "grad_norm": 0.4592360630373684, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 4877 + }, + { + "epoch": 0.04878, + "grad_norm": 0.47334555855094895, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 4878 + }, + { + "epoch": 0.04879, + "grad_norm": 0.4994212755863587, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 4879 + }, + { + "epoch": 0.0488, + "grad_norm": 0.5513466708644019, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 4880 + }, + { + "epoch": 0.04881, + "grad_norm": 0.6373315380482077, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 4881 + }, + { + "epoch": 0.04882, + "grad_norm": 0.6684216804530342, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 4882 + }, + { + "epoch": 0.04883, + "grad_norm": 0.5368064523422058, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 4883 + }, + { + "epoch": 0.04884, + "grad_norm": 0.5972330760546537, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 4884 + }, + { + "epoch": 0.04885, + "grad_norm": 0.8933715601802283, + "learning_rate": 0.003, + "loss": 4.1427, + "step": 4885 + }, + { + "epoch": 0.04886, + "grad_norm": 0.898355015821335, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 4886 + }, + { + "epoch": 0.04887, + "grad_norm": 0.7173276903905438, + "learning_rate": 0.003, + "loss": 4.1481, + "step": 4887 + }, + { + "epoch": 0.04888, + "grad_norm": 0.6726150274910551, + "learning_rate": 0.003, + "loss": 4.1516, + "step": 4888 + }, + { + "epoch": 0.04889, + "grad_norm": 0.6876961916754174, + "learning_rate": 0.003, + "loss": 4.1306, + "step": 4889 + }, + { + "epoch": 0.0489, + "grad_norm": 0.6584901279572161, + "learning_rate": 0.003, + "loss": 4.1409, + "step": 4890 + }, + { + "epoch": 0.04891, + "grad_norm": 0.5884964182869652, + "learning_rate": 0.003, + "loss": 4.1477, + "step": 4891 + }, + { + "epoch": 0.04892, + "grad_norm": 0.7007131660081303, + "learning_rate": 0.003, + "loss": 4.1497, + "step": 4892 + }, + { + "epoch": 0.04893, + "grad_norm": 0.856434726166467, + "learning_rate": 0.003, + "loss": 4.1573, + "step": 4893 + }, + { + "epoch": 0.04894, + "grad_norm": 0.8623969209321906, + "learning_rate": 0.003, + "loss": 4.1724, + "step": 4894 + }, + { + "epoch": 0.04895, + "grad_norm": 0.9386453675933166, + "learning_rate": 0.003, + "loss": 4.1501, + "step": 4895 + }, + { + "epoch": 0.04896, + "grad_norm": 1.0225765758614995, + "learning_rate": 0.003, + "loss": 4.174, + "step": 4896 + }, + { + "epoch": 0.04897, + "grad_norm": 1.0592162822847404, + "learning_rate": 0.003, + "loss": 4.1956, + "step": 4897 + }, + { + "epoch": 0.04898, + "grad_norm": 0.9806770790608599, + "learning_rate": 0.003, + "loss": 4.1844, + "step": 4898 + }, + { + "epoch": 0.04899, + "grad_norm": 1.0087204735328514, + "learning_rate": 0.003, + "loss": 4.1796, + "step": 4899 + }, + { + "epoch": 0.049, + "grad_norm": 0.9807645706509096, + "learning_rate": 0.003, + "loss": 4.2071, + "step": 4900 + }, + { + "epoch": 0.04901, + "grad_norm": 0.8655711245910307, + "learning_rate": 0.003, + "loss": 4.1672, + "step": 4901 + }, + { + "epoch": 0.04902, + "grad_norm": 0.7265751482705578, + "learning_rate": 0.003, + "loss": 4.1956, + "step": 4902 + }, + { + "epoch": 0.04903, + "grad_norm": 0.7751913689828299, + "learning_rate": 0.003, + "loss": 4.1967, + "step": 4903 + }, + { + "epoch": 0.04904, + "grad_norm": 0.791355963573371, + "learning_rate": 0.003, + "loss": 4.1646, + "step": 4904 + }, + { + "epoch": 0.04905, + "grad_norm": 0.7148922976648996, + "learning_rate": 0.003, + "loss": 4.1585, + "step": 4905 + }, + { + "epoch": 0.04906, + "grad_norm": 0.6244001153297727, + "learning_rate": 0.003, + "loss": 4.1667, + "step": 4906 + }, + { + "epoch": 0.04907, + "grad_norm": 0.766532703122461, + "learning_rate": 0.003, + "loss": 4.1521, + "step": 4907 + }, + { + "epoch": 0.04908, + "grad_norm": 0.9239125390916663, + "learning_rate": 0.003, + "loss": 4.1747, + "step": 4908 + }, + { + "epoch": 0.04909, + "grad_norm": 0.8062598058153201, + "learning_rate": 0.003, + "loss": 4.1762, + "step": 4909 + }, + { + "epoch": 0.0491, + "grad_norm": 0.6645500078640559, + "learning_rate": 0.003, + "loss": 4.1636, + "step": 4910 + }, + { + "epoch": 0.04911, + "grad_norm": 0.6285244715158727, + "learning_rate": 0.003, + "loss": 4.1564, + "step": 4911 + }, + { + "epoch": 0.04912, + "grad_norm": 0.6953556617834721, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 4912 + }, + { + "epoch": 0.04913, + "grad_norm": 0.7284223880047193, + "learning_rate": 0.003, + "loss": 4.1577, + "step": 4913 + }, + { + "epoch": 0.04914, + "grad_norm": 0.7207287527598001, + "learning_rate": 0.003, + "loss": 4.165, + "step": 4914 + }, + { + "epoch": 0.04915, + "grad_norm": 0.598886194111445, + "learning_rate": 0.003, + "loss": 4.1661, + "step": 4915 + }, + { + "epoch": 0.04916, + "grad_norm": 0.575625916272145, + "learning_rate": 0.003, + "loss": 4.1602, + "step": 4916 + }, + { + "epoch": 0.04917, + "grad_norm": 0.6136059699261334, + "learning_rate": 0.003, + "loss": 4.117, + "step": 4917 + }, + { + "epoch": 0.04918, + "grad_norm": 0.5509547477979617, + "learning_rate": 0.003, + "loss": 4.1512, + "step": 4918 + }, + { + "epoch": 0.04919, + "grad_norm": 0.4500315004245685, + "learning_rate": 0.003, + "loss": 4.1487, + "step": 4919 + }, + { + "epoch": 0.0492, + "grad_norm": 0.40527549449786815, + "learning_rate": 0.003, + "loss": 4.1467, + "step": 4920 + }, + { + "epoch": 0.04921, + "grad_norm": 0.42108054279033674, + "learning_rate": 0.003, + "loss": 4.1582, + "step": 4921 + }, + { + "epoch": 0.04922, + "grad_norm": 0.4404322304412163, + "learning_rate": 0.003, + "loss": 4.1338, + "step": 4922 + }, + { + "epoch": 0.04923, + "grad_norm": 0.47502761296305623, + "learning_rate": 0.003, + "loss": 4.1115, + "step": 4923 + }, + { + "epoch": 0.04924, + "grad_norm": 0.6308445729709988, + "learning_rate": 0.003, + "loss": 4.1681, + "step": 4924 + }, + { + "epoch": 0.04925, + "grad_norm": 0.8223519029190577, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 4925 + }, + { + "epoch": 0.04926, + "grad_norm": 0.7978760513652026, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 4926 + }, + { + "epoch": 0.04927, + "grad_norm": 0.5978087975120926, + "learning_rate": 0.003, + "loss": 4.1601, + "step": 4927 + }, + { + "epoch": 0.04928, + "grad_norm": 0.6116960538491134, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 4928 + }, + { + "epoch": 0.04929, + "grad_norm": 0.6956029366529681, + "learning_rate": 0.003, + "loss": 4.1413, + "step": 4929 + }, + { + "epoch": 0.0493, + "grad_norm": 0.6193307396741099, + "learning_rate": 0.003, + "loss": 4.1285, + "step": 4930 + }, + { + "epoch": 0.04931, + "grad_norm": 0.5548956756238017, + "learning_rate": 0.003, + "loss": 4.11, + "step": 4931 + }, + { + "epoch": 0.04932, + "grad_norm": 0.5257796103914479, + "learning_rate": 0.003, + "loss": 4.1565, + "step": 4932 + }, + { + "epoch": 0.04933, + "grad_norm": 0.525537631722876, + "learning_rate": 0.003, + "loss": 4.1311, + "step": 4933 + }, + { + "epoch": 0.04934, + "grad_norm": 0.5597601882963287, + "learning_rate": 0.003, + "loss": 4.1608, + "step": 4934 + }, + { + "epoch": 0.04935, + "grad_norm": 0.5183399428365818, + "learning_rate": 0.003, + "loss": 4.155, + "step": 4935 + }, + { + "epoch": 0.04936, + "grad_norm": 0.45976573800777254, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 4936 + }, + { + "epoch": 0.04937, + "grad_norm": 0.4718016863136529, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 4937 + }, + { + "epoch": 0.04938, + "grad_norm": 0.5145237878577654, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 4938 + }, + { + "epoch": 0.04939, + "grad_norm": 0.49376291714918813, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 4939 + }, + { + "epoch": 0.0494, + "grad_norm": 0.40835265558787853, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 4940 + }, + { + "epoch": 0.04941, + "grad_norm": 0.39634154664022514, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 4941 + }, + { + "epoch": 0.04942, + "grad_norm": 0.3697169007385673, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 4942 + }, + { + "epoch": 0.04943, + "grad_norm": 0.3331096726429694, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 4943 + }, + { + "epoch": 0.04944, + "grad_norm": 0.35453569789946104, + "learning_rate": 0.003, + "loss": 4.1443, + "step": 4944 + }, + { + "epoch": 0.04945, + "grad_norm": 0.3239762249913837, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 4945 + }, + { + "epoch": 0.04946, + "grad_norm": 0.31174772171665677, + "learning_rate": 0.003, + "loss": 4.1583, + "step": 4946 + }, + { + "epoch": 0.04947, + "grad_norm": 0.2863836162320785, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 4947 + }, + { + "epoch": 0.04948, + "grad_norm": 0.2929298130193931, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 4948 + }, + { + "epoch": 0.04949, + "grad_norm": 0.3283619905331677, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 4949 + }, + { + "epoch": 0.0495, + "grad_norm": 0.35583655037221473, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 4950 + }, + { + "epoch": 0.04951, + "grad_norm": 0.4306179937269439, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 4951 + }, + { + "epoch": 0.04952, + "grad_norm": 0.5484424313628553, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 4952 + }, + { + "epoch": 0.04953, + "grad_norm": 0.683881072985923, + "learning_rate": 0.003, + "loss": 4.1253, + "step": 4953 + }, + { + "epoch": 0.04954, + "grad_norm": 0.8387871275000114, + "learning_rate": 0.003, + "loss": 4.1538, + "step": 4954 + }, + { + "epoch": 0.04955, + "grad_norm": 1.0719745768190279, + "learning_rate": 0.003, + "loss": 4.142, + "step": 4955 + }, + { + "epoch": 0.04956, + "grad_norm": 1.0142567681805608, + "learning_rate": 0.003, + "loss": 4.1641, + "step": 4956 + }, + { + "epoch": 0.04957, + "grad_norm": 0.7860187886136664, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 4957 + }, + { + "epoch": 0.04958, + "grad_norm": 0.7666806226784963, + "learning_rate": 0.003, + "loss": 4.1464, + "step": 4958 + }, + { + "epoch": 0.04959, + "grad_norm": 0.7024610691029676, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 4959 + }, + { + "epoch": 0.0496, + "grad_norm": 0.634345285618044, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 4960 + }, + { + "epoch": 0.04961, + "grad_norm": 0.5794913569394096, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 4961 + }, + { + "epoch": 0.04962, + "grad_norm": 0.6259340689173926, + "learning_rate": 0.003, + "loss": 4.1383, + "step": 4962 + }, + { + "epoch": 0.04963, + "grad_norm": 0.6609688779347547, + "learning_rate": 0.003, + "loss": 4.176, + "step": 4963 + }, + { + "epoch": 0.04964, + "grad_norm": 0.6719506081449138, + "learning_rate": 0.003, + "loss": 4.1487, + "step": 4964 + }, + { + "epoch": 0.04965, + "grad_norm": 0.6471802630352936, + "learning_rate": 0.003, + "loss": 4.1463, + "step": 4965 + }, + { + "epoch": 0.04966, + "grad_norm": 0.7401257014237659, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 4966 + }, + { + "epoch": 0.04967, + "grad_norm": 0.7264688238953103, + "learning_rate": 0.003, + "loss": 4.1502, + "step": 4967 + }, + { + "epoch": 0.04968, + "grad_norm": 0.6850630441933709, + "learning_rate": 0.003, + "loss": 4.1738, + "step": 4968 + }, + { + "epoch": 0.04969, + "grad_norm": 0.7103603969004639, + "learning_rate": 0.003, + "loss": 4.148, + "step": 4969 + }, + { + "epoch": 0.0497, + "grad_norm": 0.7119697504065976, + "learning_rate": 0.003, + "loss": 4.1431, + "step": 4970 + }, + { + "epoch": 0.04971, + "grad_norm": 0.6691773043773404, + "learning_rate": 0.003, + "loss": 4.1618, + "step": 4971 + }, + { + "epoch": 0.04972, + "grad_norm": 0.7584471656947329, + "learning_rate": 0.003, + "loss": 4.1367, + "step": 4972 + }, + { + "epoch": 0.04973, + "grad_norm": 0.7074430853606097, + "learning_rate": 0.003, + "loss": 4.1896, + "step": 4973 + }, + { + "epoch": 0.04974, + "grad_norm": 0.5713006659555607, + "learning_rate": 0.003, + "loss": 4.1443, + "step": 4974 + }, + { + "epoch": 0.04975, + "grad_norm": 0.5130707295349296, + "learning_rate": 0.003, + "loss": 4.1369, + "step": 4975 + }, + { + "epoch": 0.04976, + "grad_norm": 0.476744226744805, + "learning_rate": 0.003, + "loss": 4.1354, + "step": 4976 + }, + { + "epoch": 0.04977, + "grad_norm": 0.436532472120347, + "learning_rate": 0.003, + "loss": 4.1425, + "step": 4977 + }, + { + "epoch": 0.04978, + "grad_norm": 0.4481149086483841, + "learning_rate": 0.003, + "loss": 4.1405, + "step": 4978 + }, + { + "epoch": 0.04979, + "grad_norm": 0.4658196526981218, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 4979 + }, + { + "epoch": 0.0498, + "grad_norm": 0.48184153753418996, + "learning_rate": 0.003, + "loss": 4.1351, + "step": 4980 + }, + { + "epoch": 0.04981, + "grad_norm": 0.5711200331385257, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 4981 + }, + { + "epoch": 0.04982, + "grad_norm": 0.7811982054881438, + "learning_rate": 0.003, + "loss": 4.1394, + "step": 4982 + }, + { + "epoch": 0.04983, + "grad_norm": 0.9119195104356123, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 4983 + }, + { + "epoch": 0.04984, + "grad_norm": 0.809114284515937, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 4984 + }, + { + "epoch": 0.04985, + "grad_norm": 0.6537212838716787, + "learning_rate": 0.003, + "loss": 4.1368, + "step": 4985 + }, + { + "epoch": 0.04986, + "grad_norm": 0.6869403529206495, + "learning_rate": 0.003, + "loss": 4.1589, + "step": 4986 + }, + { + "epoch": 0.04987, + "grad_norm": 0.613841094604171, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 4987 + }, + { + "epoch": 0.04988, + "grad_norm": 0.6618484956538617, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 4988 + }, + { + "epoch": 0.04989, + "grad_norm": 0.5936176795447469, + "learning_rate": 0.003, + "loss": 4.1518, + "step": 4989 + }, + { + "epoch": 0.0499, + "grad_norm": 0.5392404418710565, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 4990 + }, + { + "epoch": 0.04991, + "grad_norm": 0.621246461554044, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 4991 + }, + { + "epoch": 0.04992, + "grad_norm": 0.670783622770409, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 4992 + }, + { + "epoch": 0.04993, + "grad_norm": 0.6045345128052574, + "learning_rate": 0.003, + "loss": 4.1313, + "step": 4993 + }, + { + "epoch": 0.04994, + "grad_norm": 0.6101486114599416, + "learning_rate": 0.003, + "loss": 4.1165, + "step": 4994 + }, + { + "epoch": 0.04995, + "grad_norm": 0.55705863270894, + "learning_rate": 0.003, + "loss": 4.1366, + "step": 4995 + }, + { + "epoch": 0.04996, + "grad_norm": 0.44788775188911484, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 4996 + }, + { + "epoch": 0.04997, + "grad_norm": 0.4309530448591752, + "learning_rate": 0.003, + "loss": 4.13, + "step": 4997 + }, + { + "epoch": 0.04998, + "grad_norm": 0.428484725286708, + "learning_rate": 0.003, + "loss": 4.1528, + "step": 4998 + }, + { + "epoch": 0.04999, + "grad_norm": 0.41683300725155314, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 4999 + }, + { + "epoch": 0.05, + "grad_norm": 0.3880481560630285, + "learning_rate": 0.003, + "loss": 4.1518, + "step": 5000 + }, + { + "epoch": 0.05001, + "grad_norm": 0.44002892670528426, + "learning_rate": 0.003, + "loss": 4.1456, + "step": 5001 + }, + { + "epoch": 0.05002, + "grad_norm": 0.5267413563943687, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 5002 + }, + { + "epoch": 0.05003, + "grad_norm": 0.6974673629709173, + "learning_rate": 0.003, + "loss": 4.14, + "step": 5003 + }, + { + "epoch": 0.05004, + "grad_norm": 0.858937261820554, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 5004 + }, + { + "epoch": 0.05005, + "grad_norm": 0.9141406787634108, + "learning_rate": 0.003, + "loss": 4.1638, + "step": 5005 + }, + { + "epoch": 0.05006, + "grad_norm": 0.687864263963555, + "learning_rate": 0.003, + "loss": 4.1338, + "step": 5006 + }, + { + "epoch": 0.05007, + "grad_norm": 0.5773822019308112, + "learning_rate": 0.003, + "loss": 4.147, + "step": 5007 + }, + { + "epoch": 0.05008, + "grad_norm": 0.5547957033613983, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 5008 + }, + { + "epoch": 0.05009, + "grad_norm": 0.5411686344353427, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 5009 + }, + { + "epoch": 0.0501, + "grad_norm": 0.489119317455081, + "learning_rate": 0.003, + "loss": 4.134, + "step": 5010 + }, + { + "epoch": 0.05011, + "grad_norm": 0.464838598175237, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 5011 + }, + { + "epoch": 0.05012, + "grad_norm": 0.4580943694467521, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 5012 + }, + { + "epoch": 0.05013, + "grad_norm": 0.4808521248590666, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 5013 + }, + { + "epoch": 0.05014, + "grad_norm": 0.5229887414294506, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 5014 + }, + { + "epoch": 0.05015, + "grad_norm": 0.5777151908604863, + "learning_rate": 0.003, + "loss": 4.14, + "step": 5015 + }, + { + "epoch": 0.05016, + "grad_norm": 0.5235213366994899, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 5016 + }, + { + "epoch": 0.05017, + "grad_norm": 0.445246255748685, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 5017 + }, + { + "epoch": 0.05018, + "grad_norm": 0.45246478410906943, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 5018 + }, + { + "epoch": 0.05019, + "grad_norm": 0.5231762403732283, + "learning_rate": 0.003, + "loss": 4.1477, + "step": 5019 + }, + { + "epoch": 0.0502, + "grad_norm": 0.6594471849420648, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 5020 + }, + { + "epoch": 0.05021, + "grad_norm": 0.6904442298729323, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 5021 + }, + { + "epoch": 0.05022, + "grad_norm": 0.5909951697887482, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 5022 + }, + { + "epoch": 0.05023, + "grad_norm": 0.577685062094026, + "learning_rate": 0.003, + "loss": 4.1484, + "step": 5023 + }, + { + "epoch": 0.05024, + "grad_norm": 0.5728157983655138, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 5024 + }, + { + "epoch": 0.05025, + "grad_norm": 0.5379702885678885, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 5025 + }, + { + "epoch": 0.05026, + "grad_norm": 0.6155371625202312, + "learning_rate": 0.003, + "loss": 4.1225, + "step": 5026 + }, + { + "epoch": 0.05027, + "grad_norm": 0.7519388420060837, + "learning_rate": 0.003, + "loss": 4.1457, + "step": 5027 + }, + { + "epoch": 0.05028, + "grad_norm": 0.9311839881105047, + "learning_rate": 0.003, + "loss": 4.1411, + "step": 5028 + }, + { + "epoch": 0.05029, + "grad_norm": 0.9260888417245792, + "learning_rate": 0.003, + "loss": 4.1534, + "step": 5029 + }, + { + "epoch": 0.0503, + "grad_norm": 0.7728492641666489, + "learning_rate": 0.003, + "loss": 4.123, + "step": 5030 + }, + { + "epoch": 0.05031, + "grad_norm": 0.777930269535098, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 5031 + }, + { + "epoch": 0.05032, + "grad_norm": 0.7350784470721563, + "learning_rate": 0.003, + "loss": 4.145, + "step": 5032 + }, + { + "epoch": 0.05033, + "grad_norm": 0.5291553968829333, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 5033 + }, + { + "epoch": 0.05034, + "grad_norm": 0.4919036368843819, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 5034 + }, + { + "epoch": 0.05035, + "grad_norm": 0.4975664864391519, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 5035 + }, + { + "epoch": 0.05036, + "grad_norm": 0.5163959831953332, + "learning_rate": 0.003, + "loss": 4.1211, + "step": 5036 + }, + { + "epoch": 0.05037, + "grad_norm": 0.5569175575619792, + "learning_rate": 0.003, + "loss": 4.1147, + "step": 5037 + }, + { + "epoch": 0.05038, + "grad_norm": 0.5227477080467972, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 5038 + }, + { + "epoch": 0.05039, + "grad_norm": 0.5597860102419677, + "learning_rate": 0.003, + "loss": 4.1379, + "step": 5039 + }, + { + "epoch": 0.0504, + "grad_norm": 0.6074226279213635, + "learning_rate": 0.003, + "loss": 4.1376, + "step": 5040 + }, + { + "epoch": 0.05041, + "grad_norm": 0.6238557653084295, + "learning_rate": 0.003, + "loss": 4.1429, + "step": 5041 + }, + { + "epoch": 0.05042, + "grad_norm": 0.6153407136204087, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 5042 + }, + { + "epoch": 0.05043, + "grad_norm": 0.6514614471239298, + "learning_rate": 0.003, + "loss": 4.1329, + "step": 5043 + }, + { + "epoch": 0.05044, + "grad_norm": 0.6930287002961413, + "learning_rate": 0.003, + "loss": 4.1501, + "step": 5044 + }, + { + "epoch": 0.05045, + "grad_norm": 0.8304592547990185, + "learning_rate": 0.003, + "loss": 4.1731, + "step": 5045 + }, + { + "epoch": 0.05046, + "grad_norm": 0.8026034540044529, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 5046 + }, + { + "epoch": 0.05047, + "grad_norm": 0.9245681532952602, + "learning_rate": 0.003, + "loss": 4.1591, + "step": 5047 + }, + { + "epoch": 0.05048, + "grad_norm": 0.8960724676621618, + "learning_rate": 0.003, + "loss": 4.1587, + "step": 5048 + }, + { + "epoch": 0.05049, + "grad_norm": 1.0151937627560343, + "learning_rate": 0.003, + "loss": 4.1266, + "step": 5049 + }, + { + "epoch": 0.0505, + "grad_norm": 0.884162859927295, + "learning_rate": 0.003, + "loss": 4.2198, + "step": 5050 + }, + { + "epoch": 0.05051, + "grad_norm": 0.8307433200396237, + "learning_rate": 0.003, + "loss": 4.1792, + "step": 5051 + }, + { + "epoch": 0.05052, + "grad_norm": 0.7384842172109713, + "learning_rate": 0.003, + "loss": 4.1467, + "step": 5052 + }, + { + "epoch": 0.05053, + "grad_norm": 0.7203012020157923, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 5053 + }, + { + "epoch": 0.05054, + "grad_norm": 0.7327188596143545, + "learning_rate": 0.003, + "loss": 4.1775, + "step": 5054 + }, + { + "epoch": 0.05055, + "grad_norm": 0.7559923263710384, + "learning_rate": 0.003, + "loss": 4.1551, + "step": 5055 + }, + { + "epoch": 0.05056, + "grad_norm": 0.6831499776250408, + "learning_rate": 0.003, + "loss": 4.1645, + "step": 5056 + }, + { + "epoch": 0.05057, + "grad_norm": 0.6878997827607911, + "learning_rate": 0.003, + "loss": 4.169, + "step": 5057 + }, + { + "epoch": 0.05058, + "grad_norm": 0.6963511057251066, + "learning_rate": 0.003, + "loss": 4.1689, + "step": 5058 + }, + { + "epoch": 0.05059, + "grad_norm": 0.7218382309881082, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 5059 + }, + { + "epoch": 0.0506, + "grad_norm": 0.7641404073482049, + "learning_rate": 0.003, + "loss": 4.1698, + "step": 5060 + }, + { + "epoch": 0.05061, + "grad_norm": 0.7492054114953671, + "learning_rate": 0.003, + "loss": 4.1514, + "step": 5061 + }, + { + "epoch": 0.05062, + "grad_norm": 0.580002983697803, + "learning_rate": 0.003, + "loss": 4.1258, + "step": 5062 + }, + { + "epoch": 0.05063, + "grad_norm": 0.465857319923329, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 5063 + }, + { + "epoch": 0.05064, + "grad_norm": 0.4639475584360793, + "learning_rate": 0.003, + "loss": 4.1478, + "step": 5064 + }, + { + "epoch": 0.05065, + "grad_norm": 0.4316878203187786, + "learning_rate": 0.003, + "loss": 4.1319, + "step": 5065 + }, + { + "epoch": 0.05066, + "grad_norm": 0.4129716036113381, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 5066 + }, + { + "epoch": 0.05067, + "grad_norm": 0.3545337277972109, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 5067 + }, + { + "epoch": 0.05068, + "grad_norm": 0.35000428560320695, + "learning_rate": 0.003, + "loss": 4.1455, + "step": 5068 + }, + { + "epoch": 0.05069, + "grad_norm": 0.28475691607233394, + "learning_rate": 0.003, + "loss": 4.1311, + "step": 5069 + }, + { + "epoch": 0.0507, + "grad_norm": 0.3211180881977199, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 5070 + }, + { + "epoch": 0.05071, + "grad_norm": 0.31520195677598756, + "learning_rate": 0.003, + "loss": 4.1319, + "step": 5071 + }, + { + "epoch": 0.05072, + "grad_norm": 0.33671460211119836, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 5072 + }, + { + "epoch": 0.05073, + "grad_norm": 0.33769028368475607, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 5073 + }, + { + "epoch": 0.05074, + "grad_norm": 0.3448787140101478, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 5074 + }, + { + "epoch": 0.05075, + "grad_norm": 0.3957609850918836, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 5075 + }, + { + "epoch": 0.05076, + "grad_norm": 0.48241503749103687, + "learning_rate": 0.003, + "loss": 4.139, + "step": 5076 + }, + { + "epoch": 0.05077, + "grad_norm": 0.6132233951932243, + "learning_rate": 0.003, + "loss": 4.1467, + "step": 5077 + }, + { + "epoch": 0.05078, + "grad_norm": 0.7745174445055403, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 5078 + }, + { + "epoch": 0.05079, + "grad_norm": 0.8279381574948442, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 5079 + }, + { + "epoch": 0.0508, + "grad_norm": 0.8263557958238591, + "learning_rate": 0.003, + "loss": 4.1182, + "step": 5080 + }, + { + "epoch": 0.05081, + "grad_norm": 0.8277412011902472, + "learning_rate": 0.003, + "loss": 4.1478, + "step": 5081 + }, + { + "epoch": 0.05082, + "grad_norm": 0.7915182287182168, + "learning_rate": 0.003, + "loss": 4.1565, + "step": 5082 + }, + { + "epoch": 0.05083, + "grad_norm": 0.7357570092313975, + "learning_rate": 0.003, + "loss": 4.1198, + "step": 5083 + }, + { + "epoch": 0.05084, + "grad_norm": 0.9633741172415596, + "learning_rate": 0.003, + "loss": 4.1468, + "step": 5084 + }, + { + "epoch": 0.05085, + "grad_norm": 1.0990777684274788, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 5085 + }, + { + "epoch": 0.05086, + "grad_norm": 0.7846549440538764, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 5086 + }, + { + "epoch": 0.05087, + "grad_norm": 0.6846851816877524, + "learning_rate": 0.003, + "loss": 4.1432, + "step": 5087 + }, + { + "epoch": 0.05088, + "grad_norm": 0.6318100802689848, + "learning_rate": 0.003, + "loss": 4.1434, + "step": 5088 + }, + { + "epoch": 0.05089, + "grad_norm": 0.6520703017626633, + "learning_rate": 0.003, + "loss": 4.1387, + "step": 5089 + }, + { + "epoch": 0.0509, + "grad_norm": 0.5565734582358871, + "learning_rate": 0.003, + "loss": 4.1698, + "step": 5090 + }, + { + "epoch": 0.05091, + "grad_norm": 0.5349654586172464, + "learning_rate": 0.003, + "loss": 4.1305, + "step": 5091 + }, + { + "epoch": 0.05092, + "grad_norm": 0.5297741402897067, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 5092 + }, + { + "epoch": 0.05093, + "grad_norm": 0.5575684104199916, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 5093 + }, + { + "epoch": 0.05094, + "grad_norm": 0.5605888909754495, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 5094 + }, + { + "epoch": 0.05095, + "grad_norm": 0.5343018695121468, + "learning_rate": 0.003, + "loss": 4.1411, + "step": 5095 + }, + { + "epoch": 0.05096, + "grad_norm": 0.43949601788526343, + "learning_rate": 0.003, + "loss": 4.1235, + "step": 5096 + }, + { + "epoch": 0.05097, + "grad_norm": 0.4810979086570852, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 5097 + }, + { + "epoch": 0.05098, + "grad_norm": 0.4654228949280776, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 5098 + }, + { + "epoch": 0.05099, + "grad_norm": 0.40501669029112214, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 5099 + }, + { + "epoch": 0.051, + "grad_norm": 0.4223377546263004, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 5100 + }, + { + "epoch": 0.05101, + "grad_norm": 0.44525788101780217, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 5101 + }, + { + "epoch": 0.05102, + "grad_norm": 0.4871872595505277, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 5102 + }, + { + "epoch": 0.05103, + "grad_norm": 0.5447347242656629, + "learning_rate": 0.003, + "loss": 4.1317, + "step": 5103 + }, + { + "epoch": 0.05104, + "grad_norm": 0.5964115151305719, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 5104 + }, + { + "epoch": 0.05105, + "grad_norm": 0.5867716136824593, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 5105 + }, + { + "epoch": 0.05106, + "grad_norm": 0.5562979835891493, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 5106 + }, + { + "epoch": 0.05107, + "grad_norm": 0.5181396244592503, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 5107 + }, + { + "epoch": 0.05108, + "grad_norm": 0.5440116113449289, + "learning_rate": 0.003, + "loss": 4.1537, + "step": 5108 + }, + { + "epoch": 0.05109, + "grad_norm": 0.5961075613550932, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 5109 + }, + { + "epoch": 0.0511, + "grad_norm": 0.533390436132316, + "learning_rate": 0.003, + "loss": 4.1299, + "step": 5110 + }, + { + "epoch": 0.05111, + "grad_norm": 0.48024251864431305, + "learning_rate": 0.003, + "loss": 4.1412, + "step": 5111 + }, + { + "epoch": 0.05112, + "grad_norm": 0.4598495045627523, + "learning_rate": 0.003, + "loss": 4.1265, + "step": 5112 + }, + { + "epoch": 0.05113, + "grad_norm": 0.5112726037297928, + "learning_rate": 0.003, + "loss": 4.1173, + "step": 5113 + }, + { + "epoch": 0.05114, + "grad_norm": 0.6162499535834111, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 5114 + }, + { + "epoch": 0.05115, + "grad_norm": 0.6330435450833568, + "learning_rate": 0.003, + "loss": 4.1274, + "step": 5115 + }, + { + "epoch": 0.05116, + "grad_norm": 0.7922157375423148, + "learning_rate": 0.003, + "loss": 4.1422, + "step": 5116 + }, + { + "epoch": 0.05117, + "grad_norm": 0.8777485197768994, + "learning_rate": 0.003, + "loss": 4.1274, + "step": 5117 + }, + { + "epoch": 0.05118, + "grad_norm": 0.8487101677130807, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 5118 + }, + { + "epoch": 0.05119, + "grad_norm": 0.6672995585854586, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 5119 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5347949292529923, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 5120 + }, + { + "epoch": 0.05121, + "grad_norm": 0.47651085513877506, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 5121 + }, + { + "epoch": 0.05122, + "grad_norm": 0.5586400651301906, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 5122 + }, + { + "epoch": 0.05123, + "grad_norm": 0.6300817408188087, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 5123 + }, + { + "epoch": 0.05124, + "grad_norm": 0.6701205648572075, + "learning_rate": 0.003, + "loss": 4.1609, + "step": 5124 + }, + { + "epoch": 0.05125, + "grad_norm": 0.6468239470621561, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 5125 + }, + { + "epoch": 0.05126, + "grad_norm": 0.5445267421148194, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 5126 + }, + { + "epoch": 0.05127, + "grad_norm": 0.4799763707457368, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 5127 + }, + { + "epoch": 0.05128, + "grad_norm": 0.5803016836080052, + "learning_rate": 0.003, + "loss": 4.1391, + "step": 5128 + }, + { + "epoch": 0.05129, + "grad_norm": 0.678952357079139, + "learning_rate": 0.003, + "loss": 4.1115, + "step": 5129 + }, + { + "epoch": 0.0513, + "grad_norm": 0.8039176919182028, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 5130 + }, + { + "epoch": 0.05131, + "grad_norm": 0.8445292736195356, + "learning_rate": 0.003, + "loss": 4.1456, + "step": 5131 + }, + { + "epoch": 0.05132, + "grad_norm": 0.8088539903803569, + "learning_rate": 0.003, + "loss": 4.131, + "step": 5132 + }, + { + "epoch": 0.05133, + "grad_norm": 0.8452742411739833, + "learning_rate": 0.003, + "loss": 4.1457, + "step": 5133 + }, + { + "epoch": 0.05134, + "grad_norm": 0.7211547370019534, + "learning_rate": 0.003, + "loss": 4.1522, + "step": 5134 + }, + { + "epoch": 0.05135, + "grad_norm": 0.7552143125752953, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 5135 + }, + { + "epoch": 0.05136, + "grad_norm": 0.8178560583882517, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 5136 + }, + { + "epoch": 0.05137, + "grad_norm": 0.7937255401431907, + "learning_rate": 0.003, + "loss": 4.1554, + "step": 5137 + }, + { + "epoch": 0.05138, + "grad_norm": 0.7029385951002983, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 5138 + }, + { + "epoch": 0.05139, + "grad_norm": 0.755399264561879, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 5139 + }, + { + "epoch": 0.0514, + "grad_norm": 0.6372525763339177, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 5140 + }, + { + "epoch": 0.05141, + "grad_norm": 0.6168576930336179, + "learning_rate": 0.003, + "loss": 4.1854, + "step": 5141 + }, + { + "epoch": 0.05142, + "grad_norm": 0.6036281974051024, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 5142 + }, + { + "epoch": 0.05143, + "grad_norm": 0.5036860750600824, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 5143 + }, + { + "epoch": 0.05144, + "grad_norm": 0.46857974439093014, + "learning_rate": 0.003, + "loss": 4.1162, + "step": 5144 + }, + { + "epoch": 0.05145, + "grad_norm": 0.42494497123142294, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 5145 + }, + { + "epoch": 0.05146, + "grad_norm": 0.44380638929392596, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 5146 + }, + { + "epoch": 0.05147, + "grad_norm": 0.40327298093540914, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 5147 + }, + { + "epoch": 0.05148, + "grad_norm": 0.3953841068063316, + "learning_rate": 0.003, + "loss": 4.128, + "step": 5148 + }, + { + "epoch": 0.05149, + "grad_norm": 0.4012036518868883, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 5149 + }, + { + "epoch": 0.0515, + "grad_norm": 0.39949323869855685, + "learning_rate": 0.003, + "loss": 4.1369, + "step": 5150 + }, + { + "epoch": 0.05151, + "grad_norm": 0.3876338989691726, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 5151 + }, + { + "epoch": 0.05152, + "grad_norm": 0.43621474925166587, + "learning_rate": 0.003, + "loss": 4.1375, + "step": 5152 + }, + { + "epoch": 0.05153, + "grad_norm": 0.5301350008384232, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 5153 + }, + { + "epoch": 0.05154, + "grad_norm": 0.7035426773318153, + "learning_rate": 0.003, + "loss": 4.1299, + "step": 5154 + }, + { + "epoch": 0.05155, + "grad_norm": 0.9094366977526885, + "learning_rate": 0.003, + "loss": 4.1434, + "step": 5155 + }, + { + "epoch": 0.05156, + "grad_norm": 0.8124683719101861, + "learning_rate": 0.003, + "loss": 4.1683, + "step": 5156 + }, + { + "epoch": 0.05157, + "grad_norm": 0.6963497652538516, + "learning_rate": 0.003, + "loss": 4.1343, + "step": 5157 + }, + { + "epoch": 0.05158, + "grad_norm": 0.7426530943881816, + "learning_rate": 0.003, + "loss": 4.1305, + "step": 5158 + }, + { + "epoch": 0.05159, + "grad_norm": 0.6705321816606814, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 5159 + }, + { + "epoch": 0.0516, + "grad_norm": 0.6026791060312885, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 5160 + }, + { + "epoch": 0.05161, + "grad_norm": 0.6460386669653281, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 5161 + }, + { + "epoch": 0.05162, + "grad_norm": 0.7274049490439595, + "learning_rate": 0.003, + "loss": 4.1253, + "step": 5162 + }, + { + "epoch": 0.05163, + "grad_norm": 0.6827610438042071, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 5163 + }, + { + "epoch": 0.05164, + "grad_norm": 0.6633511077303998, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 5164 + }, + { + "epoch": 0.05165, + "grad_norm": 0.8069016890146392, + "learning_rate": 0.003, + "loss": 4.1464, + "step": 5165 + }, + { + "epoch": 0.05166, + "grad_norm": 0.7564185954506445, + "learning_rate": 0.003, + "loss": 4.1466, + "step": 5166 + }, + { + "epoch": 0.05167, + "grad_norm": 0.6729290579348203, + "learning_rate": 0.003, + "loss": 4.1509, + "step": 5167 + }, + { + "epoch": 0.05168, + "grad_norm": 0.681973653519437, + "learning_rate": 0.003, + "loss": 4.1784, + "step": 5168 + }, + { + "epoch": 0.05169, + "grad_norm": 0.6894995787523694, + "learning_rate": 0.003, + "loss": 4.1321, + "step": 5169 + }, + { + "epoch": 0.0517, + "grad_norm": 0.6251406068036489, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 5170 + }, + { + "epoch": 0.05171, + "grad_norm": 0.6022206459158862, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 5171 + }, + { + "epoch": 0.05172, + "grad_norm": 0.5547021446284994, + "learning_rate": 0.003, + "loss": 4.1413, + "step": 5172 + }, + { + "epoch": 0.05173, + "grad_norm": 0.5511637086768453, + "learning_rate": 0.003, + "loss": 4.1536, + "step": 5173 + }, + { + "epoch": 0.05174, + "grad_norm": 0.549834258038828, + "learning_rate": 0.003, + "loss": 4.1405, + "step": 5174 + }, + { + "epoch": 0.05175, + "grad_norm": 0.5143957250446763, + "learning_rate": 0.003, + "loss": 4.1488, + "step": 5175 + }, + { + "epoch": 0.05176, + "grad_norm": 0.5016512485210078, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 5176 + }, + { + "epoch": 0.05177, + "grad_norm": 0.5010375401941776, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 5177 + }, + { + "epoch": 0.05178, + "grad_norm": 0.45886355153966824, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 5178 + }, + { + "epoch": 0.05179, + "grad_norm": 0.4774614456980692, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 5179 + }, + { + "epoch": 0.0518, + "grad_norm": 0.5343856749000508, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 5180 + }, + { + "epoch": 0.05181, + "grad_norm": 0.5847156904354145, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 5181 + }, + { + "epoch": 0.05182, + "grad_norm": 0.7281346718853275, + "learning_rate": 0.003, + "loss": 4.1458, + "step": 5182 + }, + { + "epoch": 0.05183, + "grad_norm": 0.7959120436672125, + "learning_rate": 0.003, + "loss": 4.1146, + "step": 5183 + }, + { + "epoch": 0.05184, + "grad_norm": 0.7708701874016135, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 5184 + }, + { + "epoch": 0.05185, + "grad_norm": 0.7830980202404018, + "learning_rate": 0.003, + "loss": 4.1255, + "step": 5185 + }, + { + "epoch": 0.05186, + "grad_norm": 0.6793457202900599, + "learning_rate": 0.003, + "loss": 4.1328, + "step": 5186 + }, + { + "epoch": 0.05187, + "grad_norm": 0.6855736266075582, + "learning_rate": 0.003, + "loss": 4.171, + "step": 5187 + }, + { + "epoch": 0.05188, + "grad_norm": 0.760859985233772, + "learning_rate": 0.003, + "loss": 4.1501, + "step": 5188 + }, + { + "epoch": 0.05189, + "grad_norm": 0.7264115896262829, + "learning_rate": 0.003, + "loss": 4.124, + "step": 5189 + }, + { + "epoch": 0.0519, + "grad_norm": 0.7175290962540748, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 5190 + }, + { + "epoch": 0.05191, + "grad_norm": 0.6496658085565145, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 5191 + }, + { + "epoch": 0.05192, + "grad_norm": 0.6267055244498101, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 5192 + }, + { + "epoch": 0.05193, + "grad_norm": 0.5727047155848178, + "learning_rate": 0.003, + "loss": 4.1409, + "step": 5193 + }, + { + "epoch": 0.05194, + "grad_norm": 0.5491895800888712, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 5194 + }, + { + "epoch": 0.05195, + "grad_norm": 0.5312113948934261, + "learning_rate": 0.003, + "loss": 4.1422, + "step": 5195 + }, + { + "epoch": 0.05196, + "grad_norm": 0.5838275239944962, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 5196 + }, + { + "epoch": 0.05197, + "grad_norm": 0.5319117287619517, + "learning_rate": 0.003, + "loss": 4.1404, + "step": 5197 + }, + { + "epoch": 0.05198, + "grad_norm": 0.474231170755562, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 5198 + }, + { + "epoch": 0.05199, + "grad_norm": 0.5339097503934678, + "learning_rate": 0.003, + "loss": 4.098, + "step": 5199 + }, + { + "epoch": 0.052, + "grad_norm": 0.552513644020886, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 5200 + }, + { + "epoch": 0.05201, + "grad_norm": 0.5499912684018807, + "learning_rate": 0.003, + "loss": 4.1284, + "step": 5201 + }, + { + "epoch": 0.05202, + "grad_norm": 0.5884166479851186, + "learning_rate": 0.003, + "loss": 4.114, + "step": 5202 + }, + { + "epoch": 0.05203, + "grad_norm": 0.6234832076161475, + "learning_rate": 0.003, + "loss": 4.1481, + "step": 5203 + }, + { + "epoch": 0.05204, + "grad_norm": 0.6370950178244867, + "learning_rate": 0.003, + "loss": 4.1359, + "step": 5204 + }, + { + "epoch": 0.05205, + "grad_norm": 0.7268995146225012, + "learning_rate": 0.003, + "loss": 4.1446, + "step": 5205 + }, + { + "epoch": 0.05206, + "grad_norm": 0.8429209951426253, + "learning_rate": 0.003, + "loss": 4.1341, + "step": 5206 + }, + { + "epoch": 0.05207, + "grad_norm": 0.891067417785423, + "learning_rate": 0.003, + "loss": 4.1211, + "step": 5207 + }, + { + "epoch": 0.05208, + "grad_norm": 0.880801827049288, + "learning_rate": 0.003, + "loss": 4.1419, + "step": 5208 + }, + { + "epoch": 0.05209, + "grad_norm": 0.743628891207245, + "learning_rate": 0.003, + "loss": 4.1311, + "step": 5209 + }, + { + "epoch": 0.0521, + "grad_norm": 0.6585381210736347, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 5210 + }, + { + "epoch": 0.05211, + "grad_norm": 0.6590374105413944, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 5211 + }, + { + "epoch": 0.05212, + "grad_norm": 0.5838327606908597, + "learning_rate": 0.003, + "loss": 4.1734, + "step": 5212 + }, + { + "epoch": 0.05213, + "grad_norm": 0.5577967457265012, + "learning_rate": 0.003, + "loss": 4.1237, + "step": 5213 + }, + { + "epoch": 0.05214, + "grad_norm": 0.5831733721175694, + "learning_rate": 0.003, + "loss": 4.1188, + "step": 5214 + }, + { + "epoch": 0.05215, + "grad_norm": 0.5947055436999698, + "learning_rate": 0.003, + "loss": 4.1122, + "step": 5215 + }, + { + "epoch": 0.05216, + "grad_norm": 0.5162752873889312, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 5216 + }, + { + "epoch": 0.05217, + "grad_norm": 0.5215914347147623, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 5217 + }, + { + "epoch": 0.05218, + "grad_norm": 0.495811759691527, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 5218 + }, + { + "epoch": 0.05219, + "grad_norm": 0.5616102027479555, + "learning_rate": 0.003, + "loss": 4.1402, + "step": 5219 + }, + { + "epoch": 0.0522, + "grad_norm": 0.596993154601823, + "learning_rate": 0.003, + "loss": 4.107, + "step": 5220 + }, + { + "epoch": 0.05221, + "grad_norm": 0.6135656226590566, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 5221 + }, + { + "epoch": 0.05222, + "grad_norm": 0.6651818813368179, + "learning_rate": 0.003, + "loss": 4.1298, + "step": 5222 + }, + { + "epoch": 0.05223, + "grad_norm": 0.693814397802243, + "learning_rate": 0.003, + "loss": 4.1454, + "step": 5223 + }, + { + "epoch": 0.05224, + "grad_norm": 0.6753245461392549, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 5224 + }, + { + "epoch": 0.05225, + "grad_norm": 0.5962298095800613, + "learning_rate": 0.003, + "loss": 4.145, + "step": 5225 + }, + { + "epoch": 0.05226, + "grad_norm": 0.5681174514487727, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 5226 + }, + { + "epoch": 0.05227, + "grad_norm": 0.6355387480928073, + "learning_rate": 0.003, + "loss": 4.1595, + "step": 5227 + }, + { + "epoch": 0.05228, + "grad_norm": 0.63070568269053, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 5228 + }, + { + "epoch": 0.05229, + "grad_norm": 0.6541251553608277, + "learning_rate": 0.003, + "loss": 4.1063, + "step": 5229 + }, + { + "epoch": 0.0523, + "grad_norm": 0.5976720272605445, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 5230 + }, + { + "epoch": 0.05231, + "grad_norm": 0.5666985347278075, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 5231 + }, + { + "epoch": 0.05232, + "grad_norm": 0.5817937128133905, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 5232 + }, + { + "epoch": 0.05233, + "grad_norm": 0.5938688559238984, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 5233 + }, + { + "epoch": 0.05234, + "grad_norm": 0.5954347096748437, + "learning_rate": 0.003, + "loss": 4.1505, + "step": 5234 + }, + { + "epoch": 0.05235, + "grad_norm": 0.5732073934324781, + "learning_rate": 0.003, + "loss": 4.127, + "step": 5235 + }, + { + "epoch": 0.05236, + "grad_norm": 0.5696284586443667, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 5236 + }, + { + "epoch": 0.05237, + "grad_norm": 0.5556858968355276, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 5237 + }, + { + "epoch": 0.05238, + "grad_norm": 0.5512655986372486, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 5238 + }, + { + "epoch": 0.05239, + "grad_norm": 0.6544063071682276, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 5239 + }, + { + "epoch": 0.0524, + "grad_norm": 0.7356500161192003, + "learning_rate": 0.003, + "loss": 4.1391, + "step": 5240 + }, + { + "epoch": 0.05241, + "grad_norm": 0.88373689710533, + "learning_rate": 0.003, + "loss": 4.1526, + "step": 5241 + }, + { + "epoch": 0.05242, + "grad_norm": 0.811345798439545, + "learning_rate": 0.003, + "loss": 4.1503, + "step": 5242 + }, + { + "epoch": 0.05243, + "grad_norm": 0.7905544586858798, + "learning_rate": 0.003, + "loss": 4.1438, + "step": 5243 + }, + { + "epoch": 0.05244, + "grad_norm": 0.7298808104661723, + "learning_rate": 0.003, + "loss": 4.1179, + "step": 5244 + }, + { + "epoch": 0.05245, + "grad_norm": 0.7094573896220808, + "learning_rate": 0.003, + "loss": 4.1281, + "step": 5245 + }, + { + "epoch": 0.05246, + "grad_norm": 0.7963225003446655, + "learning_rate": 0.003, + "loss": 4.134, + "step": 5246 + }, + { + "epoch": 0.05247, + "grad_norm": 0.902516557295206, + "learning_rate": 0.003, + "loss": 4.1453, + "step": 5247 + }, + { + "epoch": 0.05248, + "grad_norm": 0.7878245147008873, + "learning_rate": 0.003, + "loss": 4.1491, + "step": 5248 + }, + { + "epoch": 0.05249, + "grad_norm": 0.6945422313261818, + "learning_rate": 0.003, + "loss": 4.1562, + "step": 5249 + }, + { + "epoch": 0.0525, + "grad_norm": 0.622441402060283, + "learning_rate": 0.003, + "loss": 4.146, + "step": 5250 + }, + { + "epoch": 0.05251, + "grad_norm": 0.5779397363926364, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 5251 + }, + { + "epoch": 0.05252, + "grad_norm": 0.577099244293535, + "learning_rate": 0.003, + "loss": 4.1569, + "step": 5252 + }, + { + "epoch": 0.05253, + "grad_norm": 0.5742421525793271, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 5253 + }, + { + "epoch": 0.05254, + "grad_norm": 0.5856615794186555, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 5254 + }, + { + "epoch": 0.05255, + "grad_norm": 0.5988982637518733, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 5255 + }, + { + "epoch": 0.05256, + "grad_norm": 0.6516682432466444, + "learning_rate": 0.003, + "loss": 4.1391, + "step": 5256 + }, + { + "epoch": 0.05257, + "grad_norm": 0.7521657888490443, + "learning_rate": 0.003, + "loss": 4.1339, + "step": 5257 + }, + { + "epoch": 0.05258, + "grad_norm": 0.7752727754327018, + "learning_rate": 0.003, + "loss": 4.138, + "step": 5258 + }, + { + "epoch": 0.05259, + "grad_norm": 0.7507817009513204, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 5259 + }, + { + "epoch": 0.0526, + "grad_norm": 0.8283788110748729, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 5260 + }, + { + "epoch": 0.05261, + "grad_norm": 0.9839152508479165, + "learning_rate": 0.003, + "loss": 4.1354, + "step": 5261 + }, + { + "epoch": 0.05262, + "grad_norm": 0.8181134341889587, + "learning_rate": 0.003, + "loss": 4.1247, + "step": 5262 + }, + { + "epoch": 0.05263, + "grad_norm": 0.8179493296972696, + "learning_rate": 0.003, + "loss": 4.1756, + "step": 5263 + }, + { + "epoch": 0.05264, + "grad_norm": 0.8507075051576984, + "learning_rate": 0.003, + "loss": 4.1758, + "step": 5264 + }, + { + "epoch": 0.05265, + "grad_norm": 0.7875040331282762, + "learning_rate": 0.003, + "loss": 4.1367, + "step": 5265 + }, + { + "epoch": 0.05266, + "grad_norm": 0.7660878019896183, + "learning_rate": 0.003, + "loss": 4.1235, + "step": 5266 + }, + { + "epoch": 0.05267, + "grad_norm": 0.7381530659889034, + "learning_rate": 0.003, + "loss": 4.1753, + "step": 5267 + }, + { + "epoch": 0.05268, + "grad_norm": 0.7516718782373394, + "learning_rate": 0.003, + "loss": 4.1372, + "step": 5268 + }, + { + "epoch": 0.05269, + "grad_norm": 0.7754342018650271, + "learning_rate": 0.003, + "loss": 4.117, + "step": 5269 + }, + { + "epoch": 0.0527, + "grad_norm": 0.7814902853941849, + "learning_rate": 0.003, + "loss": 4.1535, + "step": 5270 + }, + { + "epoch": 0.05271, + "grad_norm": 0.7175519140149296, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 5271 + }, + { + "epoch": 0.05272, + "grad_norm": 0.6180907761221967, + "learning_rate": 0.003, + "loss": 4.166, + "step": 5272 + }, + { + "epoch": 0.05273, + "grad_norm": 0.5866907243765208, + "learning_rate": 0.003, + "loss": 4.1508, + "step": 5273 + }, + { + "epoch": 0.05274, + "grad_norm": 0.7212098565817949, + "learning_rate": 0.003, + "loss": 4.1617, + "step": 5274 + }, + { + "epoch": 0.05275, + "grad_norm": 0.745726351447074, + "learning_rate": 0.003, + "loss": 4.1373, + "step": 5275 + }, + { + "epoch": 0.05276, + "grad_norm": 0.6574672838634651, + "learning_rate": 0.003, + "loss": 4.124, + "step": 5276 + }, + { + "epoch": 0.05277, + "grad_norm": 0.6169430231237321, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 5277 + }, + { + "epoch": 0.05278, + "grad_norm": 0.6357928590591313, + "learning_rate": 0.003, + "loss": 4.1268, + "step": 5278 + }, + { + "epoch": 0.05279, + "grad_norm": 0.6803536948880607, + "learning_rate": 0.003, + "loss": 4.1271, + "step": 5279 + }, + { + "epoch": 0.0528, + "grad_norm": 0.6367797725535698, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 5280 + }, + { + "epoch": 0.05281, + "grad_norm": 0.5426507335239904, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 5281 + }, + { + "epoch": 0.05282, + "grad_norm": 0.47424037246159206, + "learning_rate": 0.003, + "loss": 4.1247, + "step": 5282 + }, + { + "epoch": 0.05283, + "grad_norm": 0.3930494949754586, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 5283 + }, + { + "epoch": 0.05284, + "grad_norm": 0.3934936059529288, + "learning_rate": 0.003, + "loss": 4.1354, + "step": 5284 + }, + { + "epoch": 0.05285, + "grad_norm": 0.4015179406110254, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 5285 + }, + { + "epoch": 0.05286, + "grad_norm": 0.4480587464365292, + "learning_rate": 0.003, + "loss": 4.1513, + "step": 5286 + }, + { + "epoch": 0.05287, + "grad_norm": 0.5070013234550016, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 5287 + }, + { + "epoch": 0.05288, + "grad_norm": 0.4813637426002363, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 5288 + }, + { + "epoch": 0.05289, + "grad_norm": 0.4573299309440364, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 5289 + }, + { + "epoch": 0.0529, + "grad_norm": 0.4657267342352843, + "learning_rate": 0.003, + "loss": 4.1223, + "step": 5290 + }, + { + "epoch": 0.05291, + "grad_norm": 0.44753235119040924, + "learning_rate": 0.003, + "loss": 4.1351, + "step": 5291 + }, + { + "epoch": 0.05292, + "grad_norm": 0.46480492942670476, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 5292 + }, + { + "epoch": 0.05293, + "grad_norm": 0.45730668556713944, + "learning_rate": 0.003, + "loss": 4.1255, + "step": 5293 + }, + { + "epoch": 0.05294, + "grad_norm": 0.4237029704444932, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 5294 + }, + { + "epoch": 0.05295, + "grad_norm": 0.4482914371249565, + "learning_rate": 0.003, + "loss": 4.1222, + "step": 5295 + }, + { + "epoch": 0.05296, + "grad_norm": 0.5493021494866288, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 5296 + }, + { + "epoch": 0.05297, + "grad_norm": 0.8031816093672703, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 5297 + }, + { + "epoch": 0.05298, + "grad_norm": 1.0196685307893005, + "learning_rate": 0.003, + "loss": 4.1477, + "step": 5298 + }, + { + "epoch": 0.05299, + "grad_norm": 0.9550231337660582, + "learning_rate": 0.003, + "loss": 4.1305, + "step": 5299 + }, + { + "epoch": 0.053, + "grad_norm": 0.8307857387808788, + "learning_rate": 0.003, + "loss": 4.1466, + "step": 5300 + }, + { + "epoch": 0.05301, + "grad_norm": 0.7866950778572048, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 5301 + }, + { + "epoch": 0.05302, + "grad_norm": 0.6761619636450853, + "learning_rate": 0.003, + "loss": 4.1314, + "step": 5302 + }, + { + "epoch": 0.05303, + "grad_norm": 0.665701232744064, + "learning_rate": 0.003, + "loss": 4.1395, + "step": 5303 + }, + { + "epoch": 0.05304, + "grad_norm": 0.6461488983224262, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 5304 + }, + { + "epoch": 0.05305, + "grad_norm": 0.5665397238554841, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 5305 + }, + { + "epoch": 0.05306, + "grad_norm": 0.5309591176214379, + "learning_rate": 0.003, + "loss": 4.1396, + "step": 5306 + }, + { + "epoch": 0.05307, + "grad_norm": 0.527472903799991, + "learning_rate": 0.003, + "loss": 4.1367, + "step": 5307 + }, + { + "epoch": 0.05308, + "grad_norm": 0.44640273254720836, + "learning_rate": 0.003, + "loss": 4.1434, + "step": 5308 + }, + { + "epoch": 0.05309, + "grad_norm": 0.4752479060075969, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 5309 + }, + { + "epoch": 0.0531, + "grad_norm": 0.4497310787484318, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 5310 + }, + { + "epoch": 0.05311, + "grad_norm": 0.47563156496846587, + "learning_rate": 0.003, + "loss": 4.1546, + "step": 5311 + }, + { + "epoch": 0.05312, + "grad_norm": 0.508319969010842, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 5312 + }, + { + "epoch": 0.05313, + "grad_norm": 0.5258317632514994, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 5313 + }, + { + "epoch": 0.05314, + "grad_norm": 0.5419446505340356, + "learning_rate": 0.003, + "loss": 4.1235, + "step": 5314 + }, + { + "epoch": 0.05315, + "grad_norm": 0.5617206811761004, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 5315 + }, + { + "epoch": 0.05316, + "grad_norm": 0.6150928093152392, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 5316 + }, + { + "epoch": 0.05317, + "grad_norm": 0.6470618227813498, + "learning_rate": 0.003, + "loss": 4.1359, + "step": 5317 + }, + { + "epoch": 0.05318, + "grad_norm": 0.6497630105833105, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 5318 + }, + { + "epoch": 0.05319, + "grad_norm": 0.6589138797229622, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 5319 + }, + { + "epoch": 0.0532, + "grad_norm": 0.7898491657801758, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 5320 + }, + { + "epoch": 0.05321, + "grad_norm": 0.8933026805238407, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 5321 + }, + { + "epoch": 0.05322, + "grad_norm": 1.0398114885738736, + "learning_rate": 0.003, + "loss": 4.1639, + "step": 5322 + }, + { + "epoch": 0.05323, + "grad_norm": 0.9043069047668215, + "learning_rate": 0.003, + "loss": 4.192, + "step": 5323 + }, + { + "epoch": 0.05324, + "grad_norm": 0.7778016743549974, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 5324 + }, + { + "epoch": 0.05325, + "grad_norm": 0.6980624923718645, + "learning_rate": 0.003, + "loss": 4.1448, + "step": 5325 + }, + { + "epoch": 0.05326, + "grad_norm": 0.6326121552557883, + "learning_rate": 0.003, + "loss": 4.1634, + "step": 5326 + }, + { + "epoch": 0.05327, + "grad_norm": 0.5930605167320887, + "learning_rate": 0.003, + "loss": 4.1424, + "step": 5327 + }, + { + "epoch": 0.05328, + "grad_norm": 0.5642732817761132, + "learning_rate": 0.003, + "loss": 4.1462, + "step": 5328 + }, + { + "epoch": 0.05329, + "grad_norm": 0.6305127051813995, + "learning_rate": 0.003, + "loss": 4.1307, + "step": 5329 + }, + { + "epoch": 0.0533, + "grad_norm": 0.7052704472572536, + "learning_rate": 0.003, + "loss": 4.1527, + "step": 5330 + }, + { + "epoch": 0.05331, + "grad_norm": 0.7552985640042235, + "learning_rate": 0.003, + "loss": 4.1453, + "step": 5331 + }, + { + "epoch": 0.05332, + "grad_norm": 0.6810355156436282, + "learning_rate": 0.003, + "loss": 4.1581, + "step": 5332 + }, + { + "epoch": 0.05333, + "grad_norm": 0.6053097534106456, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 5333 + }, + { + "epoch": 0.05334, + "grad_norm": 0.8017924702361167, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 5334 + }, + { + "epoch": 0.05335, + "grad_norm": 0.8242316752152679, + "learning_rate": 0.003, + "loss": 4.1559, + "step": 5335 + }, + { + "epoch": 0.05336, + "grad_norm": 0.6320025894823955, + "learning_rate": 0.003, + "loss": 4.1609, + "step": 5336 + }, + { + "epoch": 0.05337, + "grad_norm": 0.5224549291900125, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 5337 + }, + { + "epoch": 0.05338, + "grad_norm": 0.5296562283032598, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 5338 + }, + { + "epoch": 0.05339, + "grad_norm": 0.5274630915708725, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 5339 + }, + { + "epoch": 0.0534, + "grad_norm": 0.5537136834473111, + "learning_rate": 0.003, + "loss": 4.1313, + "step": 5340 + }, + { + "epoch": 0.05341, + "grad_norm": 0.5459467303012407, + "learning_rate": 0.003, + "loss": 4.1337, + "step": 5341 + }, + { + "epoch": 0.05342, + "grad_norm": 0.5994941640281616, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 5342 + }, + { + "epoch": 0.05343, + "grad_norm": 0.6071352996442535, + "learning_rate": 0.003, + "loss": 4.137, + "step": 5343 + }, + { + "epoch": 0.05344, + "grad_norm": 0.5345329157634405, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 5344 + }, + { + "epoch": 0.05345, + "grad_norm": 0.5116910157465104, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 5345 + }, + { + "epoch": 0.05346, + "grad_norm": 0.46468708763144895, + "learning_rate": 0.003, + "loss": 4.1229, + "step": 5346 + }, + { + "epoch": 0.05347, + "grad_norm": 0.4409453577269599, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 5347 + }, + { + "epoch": 0.05348, + "grad_norm": 0.4414471535354094, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 5348 + }, + { + "epoch": 0.05349, + "grad_norm": 0.4595368190725746, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 5349 + }, + { + "epoch": 0.0535, + "grad_norm": 0.4867635564262319, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 5350 + }, + { + "epoch": 0.05351, + "grad_norm": 0.5386138190961993, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 5351 + }, + { + "epoch": 0.05352, + "grad_norm": 0.6349645099056564, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 5352 + }, + { + "epoch": 0.05353, + "grad_norm": 0.7163335707083324, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 5353 + }, + { + "epoch": 0.05354, + "grad_norm": 0.7284832017243276, + "learning_rate": 0.003, + "loss": 4.111, + "step": 5354 + }, + { + "epoch": 0.05355, + "grad_norm": 0.6051924935759244, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 5355 + }, + { + "epoch": 0.05356, + "grad_norm": 0.524816428418513, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 5356 + }, + { + "epoch": 0.05357, + "grad_norm": 0.5816829726245226, + "learning_rate": 0.003, + "loss": 4.125, + "step": 5357 + }, + { + "epoch": 0.05358, + "grad_norm": 0.5634033510447515, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 5358 + }, + { + "epoch": 0.05359, + "grad_norm": 0.6119701681573797, + "learning_rate": 0.003, + "loss": 4.1263, + "step": 5359 + }, + { + "epoch": 0.0536, + "grad_norm": 0.7787011760550901, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 5360 + }, + { + "epoch": 0.05361, + "grad_norm": 0.8465182748674693, + "learning_rate": 0.003, + "loss": 4.1364, + "step": 5361 + }, + { + "epoch": 0.05362, + "grad_norm": 0.8841512568835068, + "learning_rate": 0.003, + "loss": 4.1549, + "step": 5362 + }, + { + "epoch": 0.05363, + "grad_norm": 0.7654054247828137, + "learning_rate": 0.003, + "loss": 4.1618, + "step": 5363 + }, + { + "epoch": 0.05364, + "grad_norm": 0.6371159121774179, + "learning_rate": 0.003, + "loss": 4.1499, + "step": 5364 + }, + { + "epoch": 0.05365, + "grad_norm": 0.6825354254162556, + "learning_rate": 0.003, + "loss": 4.1455, + "step": 5365 + }, + { + "epoch": 0.05366, + "grad_norm": 0.8101026860948977, + "learning_rate": 0.003, + "loss": 4.1341, + "step": 5366 + }, + { + "epoch": 0.05367, + "grad_norm": 0.8291532467226016, + "learning_rate": 0.003, + "loss": 4.1339, + "step": 5367 + }, + { + "epoch": 0.05368, + "grad_norm": 0.8056447950741361, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 5368 + }, + { + "epoch": 0.05369, + "grad_norm": 0.8735378267002016, + "learning_rate": 0.003, + "loss": 4.1428, + "step": 5369 + }, + { + "epoch": 0.0537, + "grad_norm": 0.9391702508573958, + "learning_rate": 0.003, + "loss": 4.15, + "step": 5370 + }, + { + "epoch": 0.05371, + "grad_norm": 0.8524226126886991, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 5371 + }, + { + "epoch": 0.05372, + "grad_norm": 0.803705266443803, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 5372 + }, + { + "epoch": 0.05373, + "grad_norm": 0.7560843151563093, + "learning_rate": 0.003, + "loss": 4.1657, + "step": 5373 + }, + { + "epoch": 0.05374, + "grad_norm": 0.6930845720174758, + "learning_rate": 0.003, + "loss": 4.1398, + "step": 5374 + }, + { + "epoch": 0.05375, + "grad_norm": 0.6546702545023289, + "learning_rate": 0.003, + "loss": 4.133, + "step": 5375 + }, + { + "epoch": 0.05376, + "grad_norm": 0.6000524634204305, + "learning_rate": 0.003, + "loss": 4.1581, + "step": 5376 + }, + { + "epoch": 0.05377, + "grad_norm": 0.5834504831622397, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 5377 + }, + { + "epoch": 0.05378, + "grad_norm": 0.49712005322114705, + "learning_rate": 0.003, + "loss": 4.1435, + "step": 5378 + }, + { + "epoch": 0.05379, + "grad_norm": 0.45036859975016763, + "learning_rate": 0.003, + "loss": 4.1475, + "step": 5379 + }, + { + "epoch": 0.0538, + "grad_norm": 0.43310401465589693, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 5380 + }, + { + "epoch": 0.05381, + "grad_norm": 0.43118280545438875, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 5381 + }, + { + "epoch": 0.05382, + "grad_norm": 0.4147511601446657, + "learning_rate": 0.003, + "loss": 4.1407, + "step": 5382 + }, + { + "epoch": 0.05383, + "grad_norm": 0.41428955644326865, + "learning_rate": 0.003, + "loss": 4.1363, + "step": 5383 + }, + { + "epoch": 0.05384, + "grad_norm": 0.46974708785278296, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 5384 + }, + { + "epoch": 0.05385, + "grad_norm": 0.4804439317498876, + "learning_rate": 0.003, + "loss": 4.143, + "step": 5385 + }, + { + "epoch": 0.05386, + "grad_norm": 0.4710597747403432, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 5386 + }, + { + "epoch": 0.05387, + "grad_norm": 0.5345471173023906, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 5387 + }, + { + "epoch": 0.05388, + "grad_norm": 0.527734554101038, + "learning_rate": 0.003, + "loss": 4.1474, + "step": 5388 + }, + { + "epoch": 0.05389, + "grad_norm": 0.5157286475163666, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 5389 + }, + { + "epoch": 0.0539, + "grad_norm": 0.5289054154287074, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 5390 + }, + { + "epoch": 0.05391, + "grad_norm": 0.6349600394290038, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 5391 + }, + { + "epoch": 0.05392, + "grad_norm": 0.7287707209258841, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 5392 + }, + { + "epoch": 0.05393, + "grad_norm": 0.7525205292581727, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 5393 + }, + { + "epoch": 0.05394, + "grad_norm": 0.6536130075343365, + "learning_rate": 0.003, + "loss": 4.1161, + "step": 5394 + }, + { + "epoch": 0.05395, + "grad_norm": 0.6426311365984038, + "learning_rate": 0.003, + "loss": 4.1456, + "step": 5395 + }, + { + "epoch": 0.05396, + "grad_norm": 0.7277902979564174, + "learning_rate": 0.003, + "loss": 4.1411, + "step": 5396 + }, + { + "epoch": 0.05397, + "grad_norm": 0.7417685288107195, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 5397 + }, + { + "epoch": 0.05398, + "grad_norm": 0.7086944915328566, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 5398 + }, + { + "epoch": 0.05399, + "grad_norm": 0.714762262058065, + "learning_rate": 0.003, + "loss": 4.1439, + "step": 5399 + }, + { + "epoch": 0.054, + "grad_norm": 0.6803456287574434, + "learning_rate": 0.003, + "loss": 4.1543, + "step": 5400 + }, + { + "epoch": 0.05401, + "grad_norm": 0.7115679618160242, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 5401 + }, + { + "epoch": 0.05402, + "grad_norm": 0.6178118046647656, + "learning_rate": 0.003, + "loss": 4.1433, + "step": 5402 + }, + { + "epoch": 0.05403, + "grad_norm": 0.717834549171867, + "learning_rate": 0.003, + "loss": 4.1342, + "step": 5403 + }, + { + "epoch": 0.05404, + "grad_norm": 0.7093324185375305, + "learning_rate": 0.003, + "loss": 4.1269, + "step": 5404 + }, + { + "epoch": 0.05405, + "grad_norm": 0.6425829471366494, + "learning_rate": 0.003, + "loss": 4.1258, + "step": 5405 + }, + { + "epoch": 0.05406, + "grad_norm": 0.6272170060637055, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 5406 + }, + { + "epoch": 0.05407, + "grad_norm": 0.599920594553199, + "learning_rate": 0.003, + "loss": 4.123, + "step": 5407 + }, + { + "epoch": 0.05408, + "grad_norm": 0.5820564980718425, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 5408 + }, + { + "epoch": 0.05409, + "grad_norm": 0.5111686298731917, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 5409 + }, + { + "epoch": 0.0541, + "grad_norm": 0.462193558686699, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 5410 + }, + { + "epoch": 0.05411, + "grad_norm": 0.4130402153846359, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 5411 + }, + { + "epoch": 0.05412, + "grad_norm": 0.4056983953065427, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 5412 + }, + { + "epoch": 0.05413, + "grad_norm": 0.37874815302403264, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 5413 + }, + { + "epoch": 0.05414, + "grad_norm": 0.44844074230106556, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 5414 + }, + { + "epoch": 0.05415, + "grad_norm": 0.5721190859806238, + "learning_rate": 0.003, + "loss": 4.13, + "step": 5415 + }, + { + "epoch": 0.05416, + "grad_norm": 0.6739294583356293, + "learning_rate": 0.003, + "loss": 4.091, + "step": 5416 + }, + { + "epoch": 0.05417, + "grad_norm": 0.6903098191439943, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 5417 + }, + { + "epoch": 0.05418, + "grad_norm": 0.63417118564695, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 5418 + }, + { + "epoch": 0.05419, + "grad_norm": 0.7191816724039328, + "learning_rate": 0.003, + "loss": 4.1328, + "step": 5419 + }, + { + "epoch": 0.0542, + "grad_norm": 0.8004777313589135, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 5420 + }, + { + "epoch": 0.05421, + "grad_norm": 0.896985525441762, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 5421 + }, + { + "epoch": 0.05422, + "grad_norm": 0.7798643056239566, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 5422 + }, + { + "epoch": 0.05423, + "grad_norm": 0.7070552144440728, + "learning_rate": 0.003, + "loss": 4.1279, + "step": 5423 + }, + { + "epoch": 0.05424, + "grad_norm": 0.5579018401845559, + "learning_rate": 0.003, + "loss": 4.1542, + "step": 5424 + }, + { + "epoch": 0.05425, + "grad_norm": 0.5286925848297326, + "learning_rate": 0.003, + "loss": 4.1258, + "step": 5425 + }, + { + "epoch": 0.05426, + "grad_norm": 0.5912337560467409, + "learning_rate": 0.003, + "loss": 4.112, + "step": 5426 + }, + { + "epoch": 0.05427, + "grad_norm": 0.5997777842806946, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 5427 + }, + { + "epoch": 0.05428, + "grad_norm": 0.6476094192756237, + "learning_rate": 0.003, + "loss": 4.131, + "step": 5428 + }, + { + "epoch": 0.05429, + "grad_norm": 0.6086875232467213, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 5429 + }, + { + "epoch": 0.0543, + "grad_norm": 0.681356494329053, + "learning_rate": 0.003, + "loss": 4.1505, + "step": 5430 + }, + { + "epoch": 0.05431, + "grad_norm": 0.6615347259505286, + "learning_rate": 0.003, + "loss": 4.1237, + "step": 5431 + }, + { + "epoch": 0.05432, + "grad_norm": 0.6347016018100827, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 5432 + }, + { + "epoch": 0.05433, + "grad_norm": 0.6345480892474629, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 5433 + }, + { + "epoch": 0.05434, + "grad_norm": 0.7111690564402451, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 5434 + }, + { + "epoch": 0.05435, + "grad_norm": 0.7921322712349017, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 5435 + }, + { + "epoch": 0.05436, + "grad_norm": 0.7564267955476854, + "learning_rate": 0.003, + "loss": 4.1381, + "step": 5436 + }, + { + "epoch": 0.05437, + "grad_norm": 0.6706007148323503, + "learning_rate": 0.003, + "loss": 4.1398, + "step": 5437 + }, + { + "epoch": 0.05438, + "grad_norm": 0.6450156634525958, + "learning_rate": 0.003, + "loss": 4.1294, + "step": 5438 + }, + { + "epoch": 0.05439, + "grad_norm": 0.7387880792670661, + "learning_rate": 0.003, + "loss": 4.1225, + "step": 5439 + }, + { + "epoch": 0.0544, + "grad_norm": 0.8351260726939442, + "learning_rate": 0.003, + "loss": 4.1241, + "step": 5440 + }, + { + "epoch": 0.05441, + "grad_norm": 0.8490525937722112, + "learning_rate": 0.003, + "loss": 4.1633, + "step": 5441 + }, + { + "epoch": 0.05442, + "grad_norm": 0.674030590021176, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 5442 + }, + { + "epoch": 0.05443, + "grad_norm": 0.6505791851985235, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 5443 + }, + { + "epoch": 0.05444, + "grad_norm": 0.6106003708383616, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 5444 + }, + { + "epoch": 0.05445, + "grad_norm": 0.5632553032978198, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 5445 + }, + { + "epoch": 0.05446, + "grad_norm": 0.578137023972088, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 5446 + }, + { + "epoch": 0.05447, + "grad_norm": 0.6233499651264109, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 5447 + }, + { + "epoch": 0.05448, + "grad_norm": 0.7025863451489226, + "learning_rate": 0.003, + "loss": 4.1534, + "step": 5448 + }, + { + "epoch": 0.05449, + "grad_norm": 0.6674289251362188, + "learning_rate": 0.003, + "loss": 4.1456, + "step": 5449 + }, + { + "epoch": 0.0545, + "grad_norm": 0.68187200764662, + "learning_rate": 0.003, + "loss": 4.141, + "step": 5450 + }, + { + "epoch": 0.05451, + "grad_norm": 0.9009771762961888, + "learning_rate": 0.003, + "loss": 4.1742, + "step": 5451 + }, + { + "epoch": 0.05452, + "grad_norm": 0.9180522733634838, + "learning_rate": 0.003, + "loss": 4.1478, + "step": 5452 + }, + { + "epoch": 0.05453, + "grad_norm": 0.8476324692508597, + "learning_rate": 0.003, + "loss": 4.1392, + "step": 5453 + }, + { + "epoch": 0.05454, + "grad_norm": 0.7252925157254209, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 5454 + }, + { + "epoch": 0.05455, + "grad_norm": 0.7663075676162417, + "learning_rate": 0.003, + "loss": 4.1679, + "step": 5455 + }, + { + "epoch": 0.05456, + "grad_norm": 0.8409176235376956, + "learning_rate": 0.003, + "loss": 4.1417, + "step": 5456 + }, + { + "epoch": 0.05457, + "grad_norm": 0.8283166405126581, + "learning_rate": 0.003, + "loss": 4.1679, + "step": 5457 + }, + { + "epoch": 0.05458, + "grad_norm": 0.7230997777624161, + "learning_rate": 0.003, + "loss": 4.1382, + "step": 5458 + }, + { + "epoch": 0.05459, + "grad_norm": 0.6649973615570582, + "learning_rate": 0.003, + "loss": 4.1491, + "step": 5459 + }, + { + "epoch": 0.0546, + "grad_norm": 0.528365614611949, + "learning_rate": 0.003, + "loss": 4.1344, + "step": 5460 + }, + { + "epoch": 0.05461, + "grad_norm": 0.5520584167184678, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 5461 + }, + { + "epoch": 0.05462, + "grad_norm": 0.5391041370594502, + "learning_rate": 0.003, + "loss": 4.1598, + "step": 5462 + }, + { + "epoch": 0.05463, + "grad_norm": 0.5846088271091963, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 5463 + }, + { + "epoch": 0.05464, + "grad_norm": 0.7597079518484714, + "learning_rate": 0.003, + "loss": 4.1623, + "step": 5464 + }, + { + "epoch": 0.05465, + "grad_norm": 0.8362666805772095, + "learning_rate": 0.003, + "loss": 4.163, + "step": 5465 + }, + { + "epoch": 0.05466, + "grad_norm": 0.8383727125474473, + "learning_rate": 0.003, + "loss": 4.1732, + "step": 5466 + }, + { + "epoch": 0.05467, + "grad_norm": 0.8306791527668079, + "learning_rate": 0.003, + "loss": 4.156, + "step": 5467 + }, + { + "epoch": 0.05468, + "grad_norm": 0.6902496457265671, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 5468 + }, + { + "epoch": 0.05469, + "grad_norm": 0.6940477329883477, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 5469 + }, + { + "epoch": 0.0547, + "grad_norm": 0.6500396166893858, + "learning_rate": 0.003, + "loss": 4.1228, + "step": 5470 + }, + { + "epoch": 0.05471, + "grad_norm": 0.5274542761046104, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 5471 + }, + { + "epoch": 0.05472, + "grad_norm": 0.5294637164309897, + "learning_rate": 0.003, + "loss": 4.1204, + "step": 5472 + }, + { + "epoch": 0.05473, + "grad_norm": 0.5195706474802417, + "learning_rate": 0.003, + "loss": 4.1386, + "step": 5473 + }, + { + "epoch": 0.05474, + "grad_norm": 0.5283189608781912, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 5474 + }, + { + "epoch": 0.05475, + "grad_norm": 0.5025729117153097, + "learning_rate": 0.003, + "loss": 4.169, + "step": 5475 + }, + { + "epoch": 0.05476, + "grad_norm": 0.4403020126149031, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 5476 + }, + { + "epoch": 0.05477, + "grad_norm": 0.4121449611240764, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 5477 + }, + { + "epoch": 0.05478, + "grad_norm": 0.39409300706271555, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 5478 + }, + { + "epoch": 0.05479, + "grad_norm": 0.4508711810401754, + "learning_rate": 0.003, + "loss": 4.1255, + "step": 5479 + }, + { + "epoch": 0.0548, + "grad_norm": 0.5644355789014077, + "learning_rate": 0.003, + "loss": 4.1279, + "step": 5480 + }, + { + "epoch": 0.05481, + "grad_norm": 0.6849613721636778, + "learning_rate": 0.003, + "loss": 4.1179, + "step": 5481 + }, + { + "epoch": 0.05482, + "grad_norm": 0.7328991559368986, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 5482 + }, + { + "epoch": 0.05483, + "grad_norm": 0.6360361089895654, + "learning_rate": 0.003, + "loss": 4.114, + "step": 5483 + }, + { + "epoch": 0.05484, + "grad_norm": 0.5276598733333204, + "learning_rate": 0.003, + "loss": 4.1352, + "step": 5484 + }, + { + "epoch": 0.05485, + "grad_norm": 0.43134255974381286, + "learning_rate": 0.003, + "loss": 4.1364, + "step": 5485 + }, + { + "epoch": 0.05486, + "grad_norm": 0.49995198292073406, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 5486 + }, + { + "epoch": 0.05487, + "grad_norm": 0.5218900481549134, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 5487 + }, + { + "epoch": 0.05488, + "grad_norm": 0.6346419704768723, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 5488 + }, + { + "epoch": 0.05489, + "grad_norm": 0.7535739958723238, + "learning_rate": 0.003, + "loss": 4.1509, + "step": 5489 + }, + { + "epoch": 0.0549, + "grad_norm": 0.7585637473534005, + "learning_rate": 0.003, + "loss": 4.1253, + "step": 5490 + }, + { + "epoch": 0.05491, + "grad_norm": 0.6170687734506143, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 5491 + }, + { + "epoch": 0.05492, + "grad_norm": 0.5792989631670724, + "learning_rate": 0.003, + "loss": 4.141, + "step": 5492 + }, + { + "epoch": 0.05493, + "grad_norm": 0.5665640509165365, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 5493 + }, + { + "epoch": 0.05494, + "grad_norm": 0.6724865699209109, + "learning_rate": 0.003, + "loss": 4.1366, + "step": 5494 + }, + { + "epoch": 0.05495, + "grad_norm": 0.6552527317872832, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 5495 + }, + { + "epoch": 0.05496, + "grad_norm": 0.5714103826811914, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 5496 + }, + { + "epoch": 0.05497, + "grad_norm": 0.5522465075739208, + "learning_rate": 0.003, + "loss": 4.1321, + "step": 5497 + }, + { + "epoch": 0.05498, + "grad_norm": 0.5012005070964024, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 5498 + }, + { + "epoch": 0.05499, + "grad_norm": 0.4511204649500133, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 5499 + }, + { + "epoch": 0.055, + "grad_norm": 0.5056201476680349, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 5500 + }, + { + "epoch": 0.05501, + "grad_norm": 0.5484274718087955, + "learning_rate": 0.003, + "loss": 4.113, + "step": 5501 + }, + { + "epoch": 0.05502, + "grad_norm": 0.5221083178277119, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 5502 + }, + { + "epoch": 0.05503, + "grad_norm": 0.54151812556379, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 5503 + }, + { + "epoch": 0.05504, + "grad_norm": 0.5431761618729547, + "learning_rate": 0.003, + "loss": 4.1555, + "step": 5504 + }, + { + "epoch": 0.05505, + "grad_norm": 0.677049395087336, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 5505 + }, + { + "epoch": 0.05506, + "grad_norm": 0.9158252562242566, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 5506 + }, + { + "epoch": 0.05507, + "grad_norm": 1.1004119550566662, + "learning_rate": 0.003, + "loss": 4.1704, + "step": 5507 + }, + { + "epoch": 0.05508, + "grad_norm": 0.7690899803645341, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 5508 + }, + { + "epoch": 0.05509, + "grad_norm": 0.7409889310481907, + "learning_rate": 0.003, + "loss": 4.1318, + "step": 5509 + }, + { + "epoch": 0.0551, + "grad_norm": 0.8413828044451898, + "learning_rate": 0.003, + "loss": 4.1188, + "step": 5510 + }, + { + "epoch": 0.05511, + "grad_norm": 0.974031304462407, + "learning_rate": 0.003, + "loss": 4.1516, + "step": 5511 + }, + { + "epoch": 0.05512, + "grad_norm": 0.9261519351358342, + "learning_rate": 0.003, + "loss": 4.1574, + "step": 5512 + }, + { + "epoch": 0.05513, + "grad_norm": 0.7536753305180286, + "learning_rate": 0.003, + "loss": 4.1396, + "step": 5513 + }, + { + "epoch": 0.05514, + "grad_norm": 0.682629393213208, + "learning_rate": 0.003, + "loss": 4.1465, + "step": 5514 + }, + { + "epoch": 0.05515, + "grad_norm": 0.6438639256486987, + "learning_rate": 0.003, + "loss": 4.124, + "step": 5515 + }, + { + "epoch": 0.05516, + "grad_norm": 0.6370628235963453, + "learning_rate": 0.003, + "loss": 4.1599, + "step": 5516 + }, + { + "epoch": 0.05517, + "grad_norm": 0.6380637575916892, + "learning_rate": 0.003, + "loss": 4.117, + "step": 5517 + }, + { + "epoch": 0.05518, + "grad_norm": 0.6722144275625301, + "learning_rate": 0.003, + "loss": 4.1404, + "step": 5518 + }, + { + "epoch": 0.05519, + "grad_norm": 0.6613958013501796, + "learning_rate": 0.003, + "loss": 4.1658, + "step": 5519 + }, + { + "epoch": 0.0552, + "grad_norm": 0.5988204576269331, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 5520 + }, + { + "epoch": 0.05521, + "grad_norm": 0.6572663799653967, + "learning_rate": 0.003, + "loss": 4.1529, + "step": 5521 + }, + { + "epoch": 0.05522, + "grad_norm": 0.6243740084612656, + "learning_rate": 0.003, + "loss": 4.1628, + "step": 5522 + }, + { + "epoch": 0.05523, + "grad_norm": 0.561465995263533, + "learning_rate": 0.003, + "loss": 4.1367, + "step": 5523 + }, + { + "epoch": 0.05524, + "grad_norm": 0.5486658522043749, + "learning_rate": 0.003, + "loss": 4.1453, + "step": 5524 + }, + { + "epoch": 0.05525, + "grad_norm": 0.5695350014121137, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 5525 + }, + { + "epoch": 0.05526, + "grad_norm": 0.5346598523218599, + "learning_rate": 0.003, + "loss": 4.1162, + "step": 5526 + }, + { + "epoch": 0.05527, + "grad_norm": 0.4551658263055205, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 5527 + }, + { + "epoch": 0.05528, + "grad_norm": 0.4843464050850517, + "learning_rate": 0.003, + "loss": 4.1234, + "step": 5528 + }, + { + "epoch": 0.05529, + "grad_norm": 0.585373677911223, + "learning_rate": 0.003, + "loss": 4.1372, + "step": 5529 + }, + { + "epoch": 0.0553, + "grad_norm": 0.6825180764380449, + "learning_rate": 0.003, + "loss": 4.1496, + "step": 5530 + }, + { + "epoch": 0.05531, + "grad_norm": 0.8045417801010512, + "learning_rate": 0.003, + "loss": 4.1428, + "step": 5531 + }, + { + "epoch": 0.05532, + "grad_norm": 0.7195866881344775, + "learning_rate": 0.003, + "loss": 4.1249, + "step": 5532 + }, + { + "epoch": 0.05533, + "grad_norm": 0.6162799867320136, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 5533 + }, + { + "epoch": 0.05534, + "grad_norm": 0.7904779000302894, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 5534 + }, + { + "epoch": 0.05535, + "grad_norm": 0.8644528986049678, + "learning_rate": 0.003, + "loss": 4.1528, + "step": 5535 + }, + { + "epoch": 0.05536, + "grad_norm": 0.7766441044677856, + "learning_rate": 0.003, + "loss": 4.1412, + "step": 5536 + }, + { + "epoch": 0.05537, + "grad_norm": 0.7239299173431871, + "learning_rate": 0.003, + "loss": 4.1377, + "step": 5537 + }, + { + "epoch": 0.05538, + "grad_norm": 0.7001732822471084, + "learning_rate": 0.003, + "loss": 4.1331, + "step": 5538 + }, + { + "epoch": 0.05539, + "grad_norm": 0.6623943623113974, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 5539 + }, + { + "epoch": 0.0554, + "grad_norm": 0.6471550442576808, + "learning_rate": 0.003, + "loss": 4.1748, + "step": 5540 + }, + { + "epoch": 0.05541, + "grad_norm": 0.5868454097526585, + "learning_rate": 0.003, + "loss": 4.1489, + "step": 5541 + }, + { + "epoch": 0.05542, + "grad_norm": 0.5047287366024358, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 5542 + }, + { + "epoch": 0.05543, + "grad_norm": 0.47745524075484136, + "learning_rate": 0.003, + "loss": 4.1095, + "step": 5543 + }, + { + "epoch": 0.05544, + "grad_norm": 0.4985694332258811, + "learning_rate": 0.003, + "loss": 4.1516, + "step": 5544 + }, + { + "epoch": 0.05545, + "grad_norm": 0.5073634801765811, + "learning_rate": 0.003, + "loss": 4.1381, + "step": 5545 + }, + { + "epoch": 0.05546, + "grad_norm": 0.5237047792569028, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 5546 + }, + { + "epoch": 0.05547, + "grad_norm": 0.5914205238128549, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 5547 + }, + { + "epoch": 0.05548, + "grad_norm": 0.6165645927936478, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 5548 + }, + { + "epoch": 0.05549, + "grad_norm": 0.5898830700152652, + "learning_rate": 0.003, + "loss": 4.143, + "step": 5549 + }, + { + "epoch": 0.0555, + "grad_norm": 0.45665075102292346, + "learning_rate": 0.003, + "loss": 4.115, + "step": 5550 + }, + { + "epoch": 0.05551, + "grad_norm": 0.41856537923447845, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 5551 + }, + { + "epoch": 0.05552, + "grad_norm": 0.4227890713436711, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 5552 + }, + { + "epoch": 0.05553, + "grad_norm": 0.40800302608685, + "learning_rate": 0.003, + "loss": 4.1221, + "step": 5553 + }, + { + "epoch": 0.05554, + "grad_norm": 0.4343242066304794, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 5554 + }, + { + "epoch": 0.05555, + "grad_norm": 0.49222919452357977, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 5555 + }, + { + "epoch": 0.05556, + "grad_norm": 0.5727061854119558, + "learning_rate": 0.003, + "loss": 4.102, + "step": 5556 + }, + { + "epoch": 0.05557, + "grad_norm": 0.6430410979228637, + "learning_rate": 0.003, + "loss": 4.1312, + "step": 5557 + }, + { + "epoch": 0.05558, + "grad_norm": 0.7465449755259148, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 5558 + }, + { + "epoch": 0.05559, + "grad_norm": 0.8571034449603531, + "learning_rate": 0.003, + "loss": 4.1283, + "step": 5559 + }, + { + "epoch": 0.0556, + "grad_norm": 0.9603542514944866, + "learning_rate": 0.003, + "loss": 4.1606, + "step": 5560 + }, + { + "epoch": 0.05561, + "grad_norm": 1.1384407616016659, + "learning_rate": 0.003, + "loss": 4.1234, + "step": 5561 + }, + { + "epoch": 0.05562, + "grad_norm": 0.8542948215220858, + "learning_rate": 0.003, + "loss": 4.1323, + "step": 5562 + }, + { + "epoch": 0.05563, + "grad_norm": 0.6440539340616278, + "learning_rate": 0.003, + "loss": 4.1376, + "step": 5563 + }, + { + "epoch": 0.05564, + "grad_norm": 0.5983189844345748, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 5564 + }, + { + "epoch": 0.05565, + "grad_norm": 0.6928412710848559, + "learning_rate": 0.003, + "loss": 4.1628, + "step": 5565 + }, + { + "epoch": 0.05566, + "grad_norm": 0.7458788234271931, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 5566 + }, + { + "epoch": 0.05567, + "grad_norm": 0.6975211809831088, + "learning_rate": 0.003, + "loss": 4.1221, + "step": 5567 + }, + { + "epoch": 0.05568, + "grad_norm": 0.7797367230472155, + "learning_rate": 0.003, + "loss": 4.1575, + "step": 5568 + }, + { + "epoch": 0.05569, + "grad_norm": 0.8587086222029046, + "learning_rate": 0.003, + "loss": 4.1352, + "step": 5569 + }, + { + "epoch": 0.0557, + "grad_norm": 0.842041289187777, + "learning_rate": 0.003, + "loss": 4.1581, + "step": 5570 + }, + { + "epoch": 0.05571, + "grad_norm": 0.7876741380103197, + "learning_rate": 0.003, + "loss": 4.1478, + "step": 5571 + }, + { + "epoch": 0.05572, + "grad_norm": 0.6444705359029274, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 5572 + }, + { + "epoch": 0.05573, + "grad_norm": 0.549050192063969, + "learning_rate": 0.003, + "loss": 4.1563, + "step": 5573 + }, + { + "epoch": 0.05574, + "grad_norm": 0.5333806190349942, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 5574 + }, + { + "epoch": 0.05575, + "grad_norm": 0.5225404420588966, + "learning_rate": 0.003, + "loss": 4.1225, + "step": 5575 + }, + { + "epoch": 0.05576, + "grad_norm": 0.5429894944172927, + "learning_rate": 0.003, + "loss": 4.098, + "step": 5576 + }, + { + "epoch": 0.05577, + "grad_norm": 0.5490427675347196, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 5577 + }, + { + "epoch": 0.05578, + "grad_norm": 0.6083016266092716, + "learning_rate": 0.003, + "loss": 4.1242, + "step": 5578 + }, + { + "epoch": 0.05579, + "grad_norm": 0.6186932276076856, + "learning_rate": 0.003, + "loss": 4.1399, + "step": 5579 + }, + { + "epoch": 0.0558, + "grad_norm": 0.5780512131591367, + "learning_rate": 0.003, + "loss": 4.129, + "step": 5580 + }, + { + "epoch": 0.05581, + "grad_norm": 0.5980239436138541, + "learning_rate": 0.003, + "loss": 4.1401, + "step": 5581 + }, + { + "epoch": 0.05582, + "grad_norm": 0.5392719008352066, + "learning_rate": 0.003, + "loss": 4.1521, + "step": 5582 + }, + { + "epoch": 0.05583, + "grad_norm": 0.5546273095860996, + "learning_rate": 0.003, + "loss": 4.125, + "step": 5583 + }, + { + "epoch": 0.05584, + "grad_norm": 0.6705838265732573, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 5584 + }, + { + "epoch": 0.05585, + "grad_norm": 0.8217396804436106, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 5585 + }, + { + "epoch": 0.05586, + "grad_norm": 0.9042169206281825, + "learning_rate": 0.003, + "loss": 4.1511, + "step": 5586 + }, + { + "epoch": 0.05587, + "grad_norm": 0.9227619224934434, + "learning_rate": 0.003, + "loss": 4.1554, + "step": 5587 + }, + { + "epoch": 0.05588, + "grad_norm": 0.7777466990352064, + "learning_rate": 0.003, + "loss": 4.1271, + "step": 5588 + }, + { + "epoch": 0.05589, + "grad_norm": 0.6603780366654983, + "learning_rate": 0.003, + "loss": 4.1357, + "step": 5589 + }, + { + "epoch": 0.0559, + "grad_norm": 0.6528519374382725, + "learning_rate": 0.003, + "loss": 4.1314, + "step": 5590 + }, + { + "epoch": 0.05591, + "grad_norm": 0.6471996561956648, + "learning_rate": 0.003, + "loss": 4.1387, + "step": 5591 + }, + { + "epoch": 0.05592, + "grad_norm": 0.6641363904608714, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 5592 + }, + { + "epoch": 0.05593, + "grad_norm": 0.6321306805343749, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 5593 + }, + { + "epoch": 0.05594, + "grad_norm": 0.5652755839521619, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 5594 + }, + { + "epoch": 0.05595, + "grad_norm": 0.5052820748237068, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 5595 + }, + { + "epoch": 0.05596, + "grad_norm": 0.47170632908810217, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 5596 + }, + { + "epoch": 0.05597, + "grad_norm": 0.4937678505983004, + "learning_rate": 0.003, + "loss": 4.1396, + "step": 5597 + }, + { + "epoch": 0.05598, + "grad_norm": 0.5635649117022338, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 5598 + }, + { + "epoch": 0.05599, + "grad_norm": 0.6174104628038847, + "learning_rate": 0.003, + "loss": 4.1489, + "step": 5599 + }, + { + "epoch": 0.056, + "grad_norm": 0.6539159328500083, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 5600 + }, + { + "epoch": 0.05601, + "grad_norm": 0.6679699951089212, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 5601 + }, + { + "epoch": 0.05602, + "grad_norm": 0.6546413158566504, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 5602 + }, + { + "epoch": 0.05603, + "grad_norm": 0.6946743527150302, + "learning_rate": 0.003, + "loss": 4.115, + "step": 5603 + }, + { + "epoch": 0.05604, + "grad_norm": 0.7552162101007626, + "learning_rate": 0.003, + "loss": 4.1547, + "step": 5604 + }, + { + "epoch": 0.05605, + "grad_norm": 0.6696007064120673, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 5605 + }, + { + "epoch": 0.05606, + "grad_norm": 0.59562089846795, + "learning_rate": 0.003, + "loss": 4.1342, + "step": 5606 + }, + { + "epoch": 0.05607, + "grad_norm": 0.6269393532345678, + "learning_rate": 0.003, + "loss": 4.1513, + "step": 5607 + }, + { + "epoch": 0.05608, + "grad_norm": 0.6816772543279809, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 5608 + }, + { + "epoch": 0.05609, + "grad_norm": 0.6707354321971271, + "learning_rate": 0.003, + "loss": 4.1234, + "step": 5609 + }, + { + "epoch": 0.0561, + "grad_norm": 0.6411808035499755, + "learning_rate": 0.003, + "loss": 4.1566, + "step": 5610 + }, + { + "epoch": 0.05611, + "grad_norm": 0.6367039596675031, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 5611 + }, + { + "epoch": 0.05612, + "grad_norm": 0.6159883542316671, + "learning_rate": 0.003, + "loss": 4.117, + "step": 5612 + }, + { + "epoch": 0.05613, + "grad_norm": 0.5688379331389054, + "learning_rate": 0.003, + "loss": 4.1407, + "step": 5613 + }, + { + "epoch": 0.05614, + "grad_norm": 0.5623023558521582, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 5614 + }, + { + "epoch": 0.05615, + "grad_norm": 0.5064939834311122, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 5615 + }, + { + "epoch": 0.05616, + "grad_norm": 0.5555368115916227, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 5616 + }, + { + "epoch": 0.05617, + "grad_norm": 0.6951229882403026, + "learning_rate": 0.003, + "loss": 4.1353, + "step": 5617 + }, + { + "epoch": 0.05618, + "grad_norm": 0.7979399037113596, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 5618 + }, + { + "epoch": 0.05619, + "grad_norm": 0.7776125407615552, + "learning_rate": 0.003, + "loss": 4.1785, + "step": 5619 + }, + { + "epoch": 0.0562, + "grad_norm": 0.6817596297069429, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 5620 + }, + { + "epoch": 0.05621, + "grad_norm": 0.6549397635610529, + "learning_rate": 0.003, + "loss": 4.139, + "step": 5621 + }, + { + "epoch": 0.05622, + "grad_norm": 0.7329313978126628, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 5622 + }, + { + "epoch": 0.05623, + "grad_norm": 0.732086571615983, + "learning_rate": 0.003, + "loss": 4.1433, + "step": 5623 + }, + { + "epoch": 0.05624, + "grad_norm": 0.6771095843567011, + "learning_rate": 0.003, + "loss": 4.1599, + "step": 5624 + }, + { + "epoch": 0.05625, + "grad_norm": 0.6183411314364045, + "learning_rate": 0.003, + "loss": 4.15, + "step": 5625 + }, + { + "epoch": 0.05626, + "grad_norm": 0.6570365199694389, + "learning_rate": 0.003, + "loss": 4.1182, + "step": 5626 + }, + { + "epoch": 0.05627, + "grad_norm": 0.6639084179326356, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 5627 + }, + { + "epoch": 0.05628, + "grad_norm": 0.6844026433487909, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 5628 + }, + { + "epoch": 0.05629, + "grad_norm": 0.82713889536583, + "learning_rate": 0.003, + "loss": 4.1413, + "step": 5629 + }, + { + "epoch": 0.0563, + "grad_norm": 0.6942930662060884, + "learning_rate": 0.003, + "loss": 4.1162, + "step": 5630 + }, + { + "epoch": 0.05631, + "grad_norm": 0.5294373134589537, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 5631 + }, + { + "epoch": 0.05632, + "grad_norm": 0.49155296329990794, + "learning_rate": 0.003, + "loss": 4.1302, + "step": 5632 + }, + { + "epoch": 0.05633, + "grad_norm": 0.4835048319394899, + "learning_rate": 0.003, + "loss": 4.1263, + "step": 5633 + }, + { + "epoch": 0.05634, + "grad_norm": 0.4557319202795549, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 5634 + }, + { + "epoch": 0.05635, + "grad_norm": 0.4698037555374509, + "learning_rate": 0.003, + "loss": 4.1271, + "step": 5635 + }, + { + "epoch": 0.05636, + "grad_norm": 0.5988222861693672, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 5636 + }, + { + "epoch": 0.05637, + "grad_norm": 0.6571880685992473, + "learning_rate": 0.003, + "loss": 4.1605, + "step": 5637 + }, + { + "epoch": 0.05638, + "grad_norm": 0.589395853147304, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 5638 + }, + { + "epoch": 0.05639, + "grad_norm": 0.5433844345616452, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 5639 + }, + { + "epoch": 0.0564, + "grad_norm": 0.5170019835368442, + "learning_rate": 0.003, + "loss": 4.1235, + "step": 5640 + }, + { + "epoch": 0.05641, + "grad_norm": 0.4891316043455271, + "learning_rate": 0.003, + "loss": 4.071, + "step": 5641 + }, + { + "epoch": 0.05642, + "grad_norm": 0.4491321561168964, + "learning_rate": 0.003, + "loss": 4.139, + "step": 5642 + }, + { + "epoch": 0.05643, + "grad_norm": 0.4830092535839702, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 5643 + }, + { + "epoch": 0.05644, + "grad_norm": 0.5232904059467851, + "learning_rate": 0.003, + "loss": 4.1121, + "step": 5644 + }, + { + "epoch": 0.05645, + "grad_norm": 0.567111969651976, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 5645 + }, + { + "epoch": 0.05646, + "grad_norm": 0.6631370135003563, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 5646 + }, + { + "epoch": 0.05647, + "grad_norm": 0.8154565355116417, + "learning_rate": 0.003, + "loss": 4.125, + "step": 5647 + }, + { + "epoch": 0.05648, + "grad_norm": 0.7782448973936725, + "learning_rate": 0.003, + "loss": 4.1429, + "step": 5648 + }, + { + "epoch": 0.05649, + "grad_norm": 0.6294974249907404, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 5649 + }, + { + "epoch": 0.0565, + "grad_norm": 0.658532379840979, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 5650 + }, + { + "epoch": 0.05651, + "grad_norm": 0.5881790419238977, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 5651 + }, + { + "epoch": 0.05652, + "grad_norm": 0.6487361989272737, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 5652 + }, + { + "epoch": 0.05653, + "grad_norm": 0.890297904598085, + "learning_rate": 0.003, + "loss": 4.1452, + "step": 5653 + }, + { + "epoch": 0.05654, + "grad_norm": 1.122891362935015, + "learning_rate": 0.003, + "loss": 4.1688, + "step": 5654 + }, + { + "epoch": 0.05655, + "grad_norm": 0.7784475530833843, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 5655 + }, + { + "epoch": 0.05656, + "grad_norm": 0.7039018333153184, + "learning_rate": 0.003, + "loss": 4.1449, + "step": 5656 + }, + { + "epoch": 0.05657, + "grad_norm": 0.7611663979262661, + "learning_rate": 0.003, + "loss": 4.1451, + "step": 5657 + }, + { + "epoch": 0.05658, + "grad_norm": 0.6301443417394809, + "learning_rate": 0.003, + "loss": 4.128, + "step": 5658 + }, + { + "epoch": 0.05659, + "grad_norm": 0.5799574664434455, + "learning_rate": 0.003, + "loss": 4.1303, + "step": 5659 + }, + { + "epoch": 0.0566, + "grad_norm": 0.5398797700270416, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 5660 + }, + { + "epoch": 0.05661, + "grad_norm": 0.5607006474067805, + "learning_rate": 0.003, + "loss": 4.1688, + "step": 5661 + }, + { + "epoch": 0.05662, + "grad_norm": 0.6312047749179707, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 5662 + }, + { + "epoch": 0.05663, + "grad_norm": 0.6691547203899587, + "learning_rate": 0.003, + "loss": 4.1338, + "step": 5663 + }, + { + "epoch": 0.05664, + "grad_norm": 0.6839990390423485, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 5664 + }, + { + "epoch": 0.05665, + "grad_norm": 0.6483304454614884, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 5665 + }, + { + "epoch": 0.05666, + "grad_norm": 0.7336745114259869, + "learning_rate": 0.003, + "loss": 4.136, + "step": 5666 + }, + { + "epoch": 0.05667, + "grad_norm": 0.6601921754069777, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 5667 + }, + { + "epoch": 0.05668, + "grad_norm": 0.608008716479813, + "learning_rate": 0.003, + "loss": 4.1476, + "step": 5668 + }, + { + "epoch": 0.05669, + "grad_norm": 0.6089029787701348, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 5669 + }, + { + "epoch": 0.0567, + "grad_norm": 0.5002563109306051, + "learning_rate": 0.003, + "loss": 4.1311, + "step": 5670 + }, + { + "epoch": 0.05671, + "grad_norm": 0.5193596424869418, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 5671 + }, + { + "epoch": 0.05672, + "grad_norm": 0.6518666148309528, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 5672 + }, + { + "epoch": 0.05673, + "grad_norm": 0.7728626458240908, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 5673 + }, + { + "epoch": 0.05674, + "grad_norm": 0.8332874518712882, + "learning_rate": 0.003, + "loss": 4.1398, + "step": 5674 + }, + { + "epoch": 0.05675, + "grad_norm": 0.8323964040847633, + "learning_rate": 0.003, + "loss": 4.1313, + "step": 5675 + }, + { + "epoch": 0.05676, + "grad_norm": 0.7595324774497408, + "learning_rate": 0.003, + "loss": 4.1411, + "step": 5676 + }, + { + "epoch": 0.05677, + "grad_norm": 0.6740603533153058, + "learning_rate": 0.003, + "loss": 4.1342, + "step": 5677 + }, + { + "epoch": 0.05678, + "grad_norm": 0.5713494747040236, + "learning_rate": 0.003, + "loss": 4.1222, + "step": 5678 + }, + { + "epoch": 0.05679, + "grad_norm": 0.6253195899590996, + "learning_rate": 0.003, + "loss": 4.1404, + "step": 5679 + }, + { + "epoch": 0.0568, + "grad_norm": 0.6100710085093579, + "learning_rate": 0.003, + "loss": 4.117, + "step": 5680 + }, + { + "epoch": 0.05681, + "grad_norm": 0.6830054586678898, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 5681 + }, + { + "epoch": 0.05682, + "grad_norm": 0.7108282874400138, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 5682 + }, + { + "epoch": 0.05683, + "grad_norm": 0.5883840169923861, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 5683 + }, + { + "epoch": 0.05684, + "grad_norm": 0.6507038187659987, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 5684 + }, + { + "epoch": 0.05685, + "grad_norm": 0.6756833252452829, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 5685 + }, + { + "epoch": 0.05686, + "grad_norm": 0.583645658916334, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 5686 + }, + { + "epoch": 0.05687, + "grad_norm": 0.5953213292181491, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 5687 + }, + { + "epoch": 0.05688, + "grad_norm": 0.5475876848073131, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 5688 + }, + { + "epoch": 0.05689, + "grad_norm": 0.5025790725419514, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 5689 + }, + { + "epoch": 0.0569, + "grad_norm": 0.45433112969189393, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 5690 + }, + { + "epoch": 0.05691, + "grad_norm": 0.4140962723522923, + "learning_rate": 0.003, + "loss": 4.1404, + "step": 5691 + }, + { + "epoch": 0.05692, + "grad_norm": 0.4662306842775959, + "learning_rate": 0.003, + "loss": 4.1179, + "step": 5692 + }, + { + "epoch": 0.05693, + "grad_norm": 0.5891875363708755, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 5693 + }, + { + "epoch": 0.05694, + "grad_norm": 0.8176890746618, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 5694 + }, + { + "epoch": 0.05695, + "grad_norm": 1.0722082901443937, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 5695 + }, + { + "epoch": 0.05696, + "grad_norm": 0.9384436343645547, + "learning_rate": 0.003, + "loss": 4.1254, + "step": 5696 + }, + { + "epoch": 0.05697, + "grad_norm": 0.7353450806650836, + "learning_rate": 0.003, + "loss": 4.1609, + "step": 5697 + }, + { + "epoch": 0.05698, + "grad_norm": 0.6318836782831939, + "learning_rate": 0.003, + "loss": 4.156, + "step": 5698 + }, + { + "epoch": 0.05699, + "grad_norm": 0.5916316530186301, + "learning_rate": 0.003, + "loss": 4.1344, + "step": 5699 + }, + { + "epoch": 0.057, + "grad_norm": 0.6535089788186025, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 5700 + }, + { + "epoch": 0.05701, + "grad_norm": 0.6546023694581979, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 5701 + }, + { + "epoch": 0.05702, + "grad_norm": 0.5084570882781324, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 5702 + }, + { + "epoch": 0.05703, + "grad_norm": 0.5133248015349259, + "learning_rate": 0.003, + "loss": 4.1582, + "step": 5703 + }, + { + "epoch": 0.05704, + "grad_norm": 0.5358288934883745, + "learning_rate": 0.003, + "loss": 4.1182, + "step": 5704 + }, + { + "epoch": 0.05705, + "grad_norm": 0.5900408511498093, + "learning_rate": 0.003, + "loss": 4.1198, + "step": 5705 + }, + { + "epoch": 0.05706, + "grad_norm": 0.6627882566353759, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 5706 + }, + { + "epoch": 0.05707, + "grad_norm": 0.7192441694975358, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 5707 + }, + { + "epoch": 0.05708, + "grad_norm": 0.7445828354995511, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 5708 + }, + { + "epoch": 0.05709, + "grad_norm": 0.7483624354012154, + "learning_rate": 0.003, + "loss": 4.1312, + "step": 5709 + }, + { + "epoch": 0.0571, + "grad_norm": 0.725767615807694, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 5710 + }, + { + "epoch": 0.05711, + "grad_norm": 0.7397045380207842, + "learning_rate": 0.003, + "loss": 4.1436, + "step": 5711 + }, + { + "epoch": 0.05712, + "grad_norm": 0.7189079122457795, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 5712 + }, + { + "epoch": 0.05713, + "grad_norm": 0.7060174681207148, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 5713 + }, + { + "epoch": 0.05714, + "grad_norm": 0.6921393568332758, + "learning_rate": 0.003, + "loss": 4.1728, + "step": 5714 + }, + { + "epoch": 0.05715, + "grad_norm": 0.6750383401887041, + "learning_rate": 0.003, + "loss": 4.1752, + "step": 5715 + }, + { + "epoch": 0.05716, + "grad_norm": 0.7085886694681629, + "learning_rate": 0.003, + "loss": 4.1475, + "step": 5716 + }, + { + "epoch": 0.05717, + "grad_norm": 0.6663173016626173, + "learning_rate": 0.003, + "loss": 4.1546, + "step": 5717 + }, + { + "epoch": 0.05718, + "grad_norm": 0.7941387320047305, + "learning_rate": 0.003, + "loss": 4.1406, + "step": 5718 + }, + { + "epoch": 0.05719, + "grad_norm": 0.9177092410441985, + "learning_rate": 0.003, + "loss": 4.1523, + "step": 5719 + }, + { + "epoch": 0.0572, + "grad_norm": 0.8971727958952642, + "learning_rate": 0.003, + "loss": 4.1409, + "step": 5720 + }, + { + "epoch": 0.05721, + "grad_norm": 0.926040651918975, + "learning_rate": 0.003, + "loss": 4.1283, + "step": 5721 + }, + { + "epoch": 0.05722, + "grad_norm": 0.899008075973923, + "learning_rate": 0.003, + "loss": 4.1453, + "step": 5722 + }, + { + "epoch": 0.05723, + "grad_norm": 0.8257100704628738, + "learning_rate": 0.003, + "loss": 4.1442, + "step": 5723 + }, + { + "epoch": 0.05724, + "grad_norm": 0.7778923288997804, + "learning_rate": 0.003, + "loss": 4.1448, + "step": 5724 + }, + { + "epoch": 0.05725, + "grad_norm": 0.6413770657071006, + "learning_rate": 0.003, + "loss": 4.1505, + "step": 5725 + }, + { + "epoch": 0.05726, + "grad_norm": 0.6037306221880084, + "learning_rate": 0.003, + "loss": 4.1389, + "step": 5726 + }, + { + "epoch": 0.05727, + "grad_norm": 0.5200633077018988, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 5727 + }, + { + "epoch": 0.05728, + "grad_norm": 0.5036743411797538, + "learning_rate": 0.003, + "loss": 4.15, + "step": 5728 + }, + { + "epoch": 0.05729, + "grad_norm": 0.4985411262542399, + "learning_rate": 0.003, + "loss": 4.1271, + "step": 5729 + }, + { + "epoch": 0.0573, + "grad_norm": 0.4584917863448276, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 5730 + }, + { + "epoch": 0.05731, + "grad_norm": 0.5438247899724311, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 5731 + }, + { + "epoch": 0.05732, + "grad_norm": 0.5879044540226291, + "learning_rate": 0.003, + "loss": 4.1438, + "step": 5732 + }, + { + "epoch": 0.05733, + "grad_norm": 0.5930153226482116, + "learning_rate": 0.003, + "loss": 4.1533, + "step": 5733 + }, + { + "epoch": 0.05734, + "grad_norm": 0.551401918593653, + "learning_rate": 0.003, + "loss": 4.146, + "step": 5734 + }, + { + "epoch": 0.05735, + "grad_norm": 0.44849752852507807, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 5735 + }, + { + "epoch": 0.05736, + "grad_norm": 0.44905168397046813, + "learning_rate": 0.003, + "loss": 4.1292, + "step": 5736 + }, + { + "epoch": 0.05737, + "grad_norm": 0.5268320731054086, + "learning_rate": 0.003, + "loss": 4.124, + "step": 5737 + }, + { + "epoch": 0.05738, + "grad_norm": 0.5127400458979028, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 5738 + }, + { + "epoch": 0.05739, + "grad_norm": 0.4584064056524342, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 5739 + }, + { + "epoch": 0.0574, + "grad_norm": 0.43315600343740956, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 5740 + }, + { + "epoch": 0.05741, + "grad_norm": 0.435296931858689, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 5741 + }, + { + "epoch": 0.05742, + "grad_norm": 0.4223546407108866, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 5742 + }, + { + "epoch": 0.05743, + "grad_norm": 0.3829337526851232, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 5743 + }, + { + "epoch": 0.05744, + "grad_norm": 0.42582439817855655, + "learning_rate": 0.003, + "loss": 4.1132, + "step": 5744 + }, + { + "epoch": 0.05745, + "grad_norm": 0.5032675064997798, + "learning_rate": 0.003, + "loss": 4.1142, + "step": 5745 + }, + { + "epoch": 0.05746, + "grad_norm": 0.5371812025082381, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 5746 + }, + { + "epoch": 0.05747, + "grad_norm": 0.5691353438016382, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 5747 + }, + { + "epoch": 0.05748, + "grad_norm": 0.5609944240328593, + "learning_rate": 0.003, + "loss": 4.1463, + "step": 5748 + }, + { + "epoch": 0.05749, + "grad_norm": 0.5812063114829625, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 5749 + }, + { + "epoch": 0.0575, + "grad_norm": 0.59242400709736, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 5750 + }, + { + "epoch": 0.05751, + "grad_norm": 0.5892062367000988, + "learning_rate": 0.003, + "loss": 4.1363, + "step": 5751 + }, + { + "epoch": 0.05752, + "grad_norm": 0.6411889904390121, + "learning_rate": 0.003, + "loss": 4.125, + "step": 5752 + }, + { + "epoch": 0.05753, + "grad_norm": 0.7623604527296114, + "learning_rate": 0.003, + "loss": 4.1237, + "step": 5753 + }, + { + "epoch": 0.05754, + "grad_norm": 0.8190982577351572, + "learning_rate": 0.003, + "loss": 4.1283, + "step": 5754 + }, + { + "epoch": 0.05755, + "grad_norm": 0.8129781285023648, + "learning_rate": 0.003, + "loss": 4.1147, + "step": 5755 + }, + { + "epoch": 0.05756, + "grad_norm": 0.9645409102735437, + "learning_rate": 0.003, + "loss": 4.1549, + "step": 5756 + }, + { + "epoch": 0.05757, + "grad_norm": 1.0613767929848879, + "learning_rate": 0.003, + "loss": 4.1515, + "step": 5757 + }, + { + "epoch": 0.05758, + "grad_norm": 0.8340493918370546, + "learning_rate": 0.003, + "loss": 4.141, + "step": 5758 + }, + { + "epoch": 0.05759, + "grad_norm": 0.7501058117672132, + "learning_rate": 0.003, + "loss": 4.1371, + "step": 5759 + }, + { + "epoch": 0.0576, + "grad_norm": 0.6678453188221255, + "learning_rate": 0.003, + "loss": 4.1321, + "step": 5760 + }, + { + "epoch": 0.05761, + "grad_norm": 0.7222477111017501, + "learning_rate": 0.003, + "loss": 4.1308, + "step": 5761 + }, + { + "epoch": 0.05762, + "grad_norm": 0.8358884638373766, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 5762 + }, + { + "epoch": 0.05763, + "grad_norm": 0.9017240989057173, + "learning_rate": 0.003, + "loss": 4.1463, + "step": 5763 + }, + { + "epoch": 0.05764, + "grad_norm": 0.9525671514720745, + "learning_rate": 0.003, + "loss": 4.1454, + "step": 5764 + }, + { + "epoch": 0.05765, + "grad_norm": 0.903840898655556, + "learning_rate": 0.003, + "loss": 4.1594, + "step": 5765 + }, + { + "epoch": 0.05766, + "grad_norm": 0.841117793582558, + "learning_rate": 0.003, + "loss": 4.1584, + "step": 5766 + }, + { + "epoch": 0.05767, + "grad_norm": 0.8184231215193009, + "learning_rate": 0.003, + "loss": 4.1448, + "step": 5767 + }, + { + "epoch": 0.05768, + "grad_norm": 0.7953651176604322, + "learning_rate": 0.003, + "loss": 4.1647, + "step": 5768 + }, + { + "epoch": 0.05769, + "grad_norm": 0.8641222338040627, + "learning_rate": 0.003, + "loss": 4.1594, + "step": 5769 + }, + { + "epoch": 0.0577, + "grad_norm": 0.8267946717518797, + "learning_rate": 0.003, + "loss": 4.1629, + "step": 5770 + }, + { + "epoch": 0.05771, + "grad_norm": 0.7549731798996204, + "learning_rate": 0.003, + "loss": 4.1637, + "step": 5771 + }, + { + "epoch": 0.05772, + "grad_norm": 0.8471888716859771, + "learning_rate": 0.003, + "loss": 4.174, + "step": 5772 + }, + { + "epoch": 0.05773, + "grad_norm": 0.8004119994019282, + "learning_rate": 0.003, + "loss": 4.1698, + "step": 5773 + }, + { + "epoch": 0.05774, + "grad_norm": 0.6795800808077224, + "learning_rate": 0.003, + "loss": 4.136, + "step": 5774 + }, + { + "epoch": 0.05775, + "grad_norm": 0.6426254130546161, + "learning_rate": 0.003, + "loss": 4.14, + "step": 5775 + }, + { + "epoch": 0.05776, + "grad_norm": 0.5457159419232533, + "learning_rate": 0.003, + "loss": 4.1307, + "step": 5776 + }, + { + "epoch": 0.05777, + "grad_norm": 0.5193032203296611, + "learning_rate": 0.003, + "loss": 4.1452, + "step": 5777 + }, + { + "epoch": 0.05778, + "grad_norm": 0.5430365890766845, + "learning_rate": 0.003, + "loss": 4.1274, + "step": 5778 + }, + { + "epoch": 0.05779, + "grad_norm": 0.5514106031103018, + "learning_rate": 0.003, + "loss": 4.1516, + "step": 5779 + }, + { + "epoch": 0.0578, + "grad_norm": 0.5595076136345223, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 5780 + }, + { + "epoch": 0.05781, + "grad_norm": 0.6358454172899635, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 5781 + }, + { + "epoch": 0.05782, + "grad_norm": 0.7067244036587653, + "learning_rate": 0.003, + "loss": 4.1522, + "step": 5782 + }, + { + "epoch": 0.05783, + "grad_norm": 0.6579510175517927, + "learning_rate": 0.003, + "loss": 4.094, + "step": 5783 + }, + { + "epoch": 0.05784, + "grad_norm": 0.4615349795672166, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 5784 + }, + { + "epoch": 0.05785, + "grad_norm": 0.40423330402508734, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 5785 + }, + { + "epoch": 0.05786, + "grad_norm": 0.4219551145978536, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 5786 + }, + { + "epoch": 0.05787, + "grad_norm": 0.47190796765263787, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 5787 + }, + { + "epoch": 0.05788, + "grad_norm": 0.48647627634592133, + "learning_rate": 0.003, + "loss": 4.1161, + "step": 5788 + }, + { + "epoch": 0.05789, + "grad_norm": 0.45916564143367755, + "learning_rate": 0.003, + "loss": 4.1164, + "step": 5789 + }, + { + "epoch": 0.0579, + "grad_norm": 0.38438157656491667, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 5790 + }, + { + "epoch": 0.05791, + "grad_norm": 0.380767444456616, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 5791 + }, + { + "epoch": 0.05792, + "grad_norm": 0.39494151370656566, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 5792 + }, + { + "epoch": 0.05793, + "grad_norm": 0.4521836450575782, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 5793 + }, + { + "epoch": 0.05794, + "grad_norm": 0.5585408775581093, + "learning_rate": 0.003, + "loss": 4.1152, + "step": 5794 + }, + { + "epoch": 0.05795, + "grad_norm": 0.7037387927731176, + "learning_rate": 0.003, + "loss": 4.1517, + "step": 5795 + }, + { + "epoch": 0.05796, + "grad_norm": 0.7512006570045856, + "learning_rate": 0.003, + "loss": 4.1229, + "step": 5796 + }, + { + "epoch": 0.05797, + "grad_norm": 0.6139515253192398, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 5797 + }, + { + "epoch": 0.05798, + "grad_norm": 0.42741457828347207, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 5798 + }, + { + "epoch": 0.05799, + "grad_norm": 0.4697070878309648, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 5799 + }, + { + "epoch": 0.058, + "grad_norm": 0.5718953135900179, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 5800 + }, + { + "epoch": 0.05801, + "grad_norm": 0.6101112337124837, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 5801 + }, + { + "epoch": 0.05802, + "grad_norm": 0.518362856862478, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 5802 + }, + { + "epoch": 0.05803, + "grad_norm": 0.5319512529198976, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 5803 + }, + { + "epoch": 0.05804, + "grad_norm": 0.583888596545647, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 5804 + }, + { + "epoch": 0.05805, + "grad_norm": 0.6332538337272945, + "learning_rate": 0.003, + "loss": 4.086, + "step": 5805 + }, + { + "epoch": 0.05806, + "grad_norm": 0.6591146413634413, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 5806 + }, + { + "epoch": 0.05807, + "grad_norm": 0.5648403140301973, + "learning_rate": 0.003, + "loss": 4.1258, + "step": 5807 + }, + { + "epoch": 0.05808, + "grad_norm": 0.5082305082390789, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 5808 + }, + { + "epoch": 0.05809, + "grad_norm": 0.45869354878914337, + "learning_rate": 0.003, + "loss": 4.136, + "step": 5809 + }, + { + "epoch": 0.0581, + "grad_norm": 0.5128621779633227, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 5810 + }, + { + "epoch": 0.05811, + "grad_norm": 0.4634974372718931, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 5811 + }, + { + "epoch": 0.05812, + "grad_norm": 0.5415369741433366, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 5812 + }, + { + "epoch": 0.05813, + "grad_norm": 0.6045041038302872, + "learning_rate": 0.003, + "loss": 4.1169, + "step": 5813 + }, + { + "epoch": 0.05814, + "grad_norm": 0.781348878698672, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 5814 + }, + { + "epoch": 0.05815, + "grad_norm": 0.9049868579609852, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 5815 + }, + { + "epoch": 0.05816, + "grad_norm": 0.8496790014644907, + "learning_rate": 0.003, + "loss": 4.1366, + "step": 5816 + }, + { + "epoch": 0.05817, + "grad_norm": 0.6848842285781443, + "learning_rate": 0.003, + "loss": 4.1451, + "step": 5817 + }, + { + "epoch": 0.05818, + "grad_norm": 0.6972808127885899, + "learning_rate": 0.003, + "loss": 4.1262, + "step": 5818 + }, + { + "epoch": 0.05819, + "grad_norm": 0.7928565409685948, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 5819 + }, + { + "epoch": 0.0582, + "grad_norm": 0.8531050086329389, + "learning_rate": 0.003, + "loss": 4.1317, + "step": 5820 + }, + { + "epoch": 0.05821, + "grad_norm": 0.8181521585974338, + "learning_rate": 0.003, + "loss": 4.1487, + "step": 5821 + }, + { + "epoch": 0.05822, + "grad_norm": 0.6685069393486631, + "learning_rate": 0.003, + "loss": 4.103, + "step": 5822 + }, + { + "epoch": 0.05823, + "grad_norm": 0.7178511889820098, + "learning_rate": 0.003, + "loss": 4.138, + "step": 5823 + }, + { + "epoch": 0.05824, + "grad_norm": 0.7316138274504331, + "learning_rate": 0.003, + "loss": 4.1395, + "step": 5824 + }, + { + "epoch": 0.05825, + "grad_norm": 0.7577488477174734, + "learning_rate": 0.003, + "loss": 4.1663, + "step": 5825 + }, + { + "epoch": 0.05826, + "grad_norm": 0.7352824443915027, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 5826 + }, + { + "epoch": 0.05827, + "grad_norm": 0.737466448111901, + "learning_rate": 0.003, + "loss": 4.1413, + "step": 5827 + }, + { + "epoch": 0.05828, + "grad_norm": 0.5756572073483629, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 5828 + }, + { + "epoch": 0.05829, + "grad_norm": 0.5700830450607922, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 5829 + }, + { + "epoch": 0.0583, + "grad_norm": 0.6090643389167884, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 5830 + }, + { + "epoch": 0.05831, + "grad_norm": 0.6598337613053546, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 5831 + }, + { + "epoch": 0.05832, + "grad_norm": 0.7236800212652943, + "learning_rate": 0.003, + "loss": 4.1451, + "step": 5832 + }, + { + "epoch": 0.05833, + "grad_norm": 0.8312662391240169, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 5833 + }, + { + "epoch": 0.05834, + "grad_norm": 0.8570982085297809, + "learning_rate": 0.003, + "loss": 4.1458, + "step": 5834 + }, + { + "epoch": 0.05835, + "grad_norm": 0.7634242055631113, + "learning_rate": 0.003, + "loss": 4.1634, + "step": 5835 + }, + { + "epoch": 0.05836, + "grad_norm": 0.8129339709531613, + "learning_rate": 0.003, + "loss": 4.1529, + "step": 5836 + }, + { + "epoch": 0.05837, + "grad_norm": 0.7114562019507951, + "learning_rate": 0.003, + "loss": 4.1424, + "step": 5837 + }, + { + "epoch": 0.05838, + "grad_norm": 0.7146446360282164, + "learning_rate": 0.003, + "loss": 4.144, + "step": 5838 + }, + { + "epoch": 0.05839, + "grad_norm": 0.6826153272700018, + "learning_rate": 0.003, + "loss": 4.1438, + "step": 5839 + }, + { + "epoch": 0.0584, + "grad_norm": 0.6296145264005895, + "learning_rate": 0.003, + "loss": 4.1591, + "step": 5840 + }, + { + "epoch": 0.05841, + "grad_norm": 0.6400001378596765, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 5841 + }, + { + "epoch": 0.05842, + "grad_norm": 0.6745147859755771, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 5842 + }, + { + "epoch": 0.05843, + "grad_norm": 0.7204687011170773, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 5843 + }, + { + "epoch": 0.05844, + "grad_norm": 0.6692907850570987, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 5844 + }, + { + "epoch": 0.05845, + "grad_norm": 0.6464847846170348, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 5845 + }, + { + "epoch": 0.05846, + "grad_norm": 0.5903179636360006, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 5846 + }, + { + "epoch": 0.05847, + "grad_norm": 0.5715707353163612, + "learning_rate": 0.003, + "loss": 4.147, + "step": 5847 + }, + { + "epoch": 0.05848, + "grad_norm": 0.6188778057459394, + "learning_rate": 0.003, + "loss": 4.1234, + "step": 5848 + }, + { + "epoch": 0.05849, + "grad_norm": 0.679788170958821, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 5849 + }, + { + "epoch": 0.0585, + "grad_norm": 0.6866570136156571, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 5850 + }, + { + "epoch": 0.05851, + "grad_norm": 0.6035947467507179, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 5851 + }, + { + "epoch": 0.05852, + "grad_norm": 0.6646374634051921, + "learning_rate": 0.003, + "loss": 4.114, + "step": 5852 + }, + { + "epoch": 0.05853, + "grad_norm": 0.6845982684639399, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 5853 + }, + { + "epoch": 0.05854, + "grad_norm": 0.6479252813025695, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 5854 + }, + { + "epoch": 0.05855, + "grad_norm": 0.729313342535791, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 5855 + }, + { + "epoch": 0.05856, + "grad_norm": 0.8374311171433904, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 5856 + }, + { + "epoch": 0.05857, + "grad_norm": 0.8450077468218878, + "learning_rate": 0.003, + "loss": 4.1433, + "step": 5857 + }, + { + "epoch": 0.05858, + "grad_norm": 0.7554133376301935, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 5858 + }, + { + "epoch": 0.05859, + "grad_norm": 0.6288985709272776, + "learning_rate": 0.003, + "loss": 4.1314, + "step": 5859 + }, + { + "epoch": 0.0586, + "grad_norm": 0.6807746349622874, + "learning_rate": 0.003, + "loss": 4.1361, + "step": 5860 + }, + { + "epoch": 0.05861, + "grad_norm": 0.6685473536685101, + "learning_rate": 0.003, + "loss": 4.1524, + "step": 5861 + }, + { + "epoch": 0.05862, + "grad_norm": 0.5947123918251042, + "learning_rate": 0.003, + "loss": 4.1412, + "step": 5862 + }, + { + "epoch": 0.05863, + "grad_norm": 0.6529519804727131, + "learning_rate": 0.003, + "loss": 4.1397, + "step": 5863 + }, + { + "epoch": 0.05864, + "grad_norm": 0.7095951129146904, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 5864 + }, + { + "epoch": 0.05865, + "grad_norm": 0.69339783930652, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 5865 + }, + { + "epoch": 0.05866, + "grad_norm": 0.6690878299187139, + "learning_rate": 0.003, + "loss": 4.1362, + "step": 5866 + }, + { + "epoch": 0.05867, + "grad_norm": 0.7086863427678343, + "learning_rate": 0.003, + "loss": 4.1189, + "step": 5867 + }, + { + "epoch": 0.05868, + "grad_norm": 0.6786171909040684, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 5868 + }, + { + "epoch": 0.05869, + "grad_norm": 0.5232033904488403, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 5869 + }, + { + "epoch": 0.0587, + "grad_norm": 0.47022549323018026, + "learning_rate": 0.003, + "loss": 4.109, + "step": 5870 + }, + { + "epoch": 0.05871, + "grad_norm": 0.44108238286000917, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 5871 + }, + { + "epoch": 0.05872, + "grad_norm": 0.3913734511800551, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 5872 + }, + { + "epoch": 0.05873, + "grad_norm": 0.42138434666249186, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 5873 + }, + { + "epoch": 0.05874, + "grad_norm": 0.4288958519233743, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 5874 + }, + { + "epoch": 0.05875, + "grad_norm": 0.44577682626719106, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 5875 + }, + { + "epoch": 0.05876, + "grad_norm": 0.4510573231336446, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 5876 + }, + { + "epoch": 0.05877, + "grad_norm": 0.43564108247460703, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 5877 + }, + { + "epoch": 0.05878, + "grad_norm": 0.5419890900941077, + "learning_rate": 0.003, + "loss": 4.1323, + "step": 5878 + }, + { + "epoch": 0.05879, + "grad_norm": 0.6244220397965549, + "learning_rate": 0.003, + "loss": 4.1481, + "step": 5879 + }, + { + "epoch": 0.0588, + "grad_norm": 0.7402086193529186, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 5880 + }, + { + "epoch": 0.05881, + "grad_norm": 0.9386524101489326, + "learning_rate": 0.003, + "loss": 4.1251, + "step": 5881 + }, + { + "epoch": 0.05882, + "grad_norm": 0.9522508870914118, + "learning_rate": 0.003, + "loss": 4.1512, + "step": 5882 + }, + { + "epoch": 0.05883, + "grad_norm": 0.8075223632251547, + "learning_rate": 0.003, + "loss": 4.1392, + "step": 5883 + }, + { + "epoch": 0.05884, + "grad_norm": 0.8799909048791514, + "learning_rate": 0.003, + "loss": 4.1323, + "step": 5884 + }, + { + "epoch": 0.05885, + "grad_norm": 0.8530787571893335, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 5885 + }, + { + "epoch": 0.05886, + "grad_norm": 0.7329103122991582, + "learning_rate": 0.003, + "loss": 4.1364, + "step": 5886 + }, + { + "epoch": 0.05887, + "grad_norm": 0.6748479851298841, + "learning_rate": 0.003, + "loss": 4.1271, + "step": 5887 + }, + { + "epoch": 0.05888, + "grad_norm": 0.6343669394015016, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 5888 + }, + { + "epoch": 0.05889, + "grad_norm": 0.6277512686474859, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 5889 + }, + { + "epoch": 0.0589, + "grad_norm": 0.5871377194165454, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 5890 + }, + { + "epoch": 0.05891, + "grad_norm": 0.6464959924160834, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 5891 + }, + { + "epoch": 0.05892, + "grad_norm": 0.666522134105091, + "learning_rate": 0.003, + "loss": 4.1519, + "step": 5892 + }, + { + "epoch": 0.05893, + "grad_norm": 0.6842219549286493, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 5893 + }, + { + "epoch": 0.05894, + "grad_norm": 0.6497740081031832, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 5894 + }, + { + "epoch": 0.05895, + "grad_norm": 0.6324488507002164, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 5895 + }, + { + "epoch": 0.05896, + "grad_norm": 0.6784791219839371, + "learning_rate": 0.003, + "loss": 4.1405, + "step": 5896 + }, + { + "epoch": 0.05897, + "grad_norm": 0.633556875627449, + "learning_rate": 0.003, + "loss": 4.1416, + "step": 5897 + }, + { + "epoch": 0.05898, + "grad_norm": 0.7611480962202484, + "learning_rate": 0.003, + "loss": 4.1402, + "step": 5898 + }, + { + "epoch": 0.05899, + "grad_norm": 0.8755081899697468, + "learning_rate": 0.003, + "loss": 4.1115, + "step": 5899 + }, + { + "epoch": 0.059, + "grad_norm": 0.7092368456575846, + "learning_rate": 0.003, + "loss": 4.1317, + "step": 5900 + }, + { + "epoch": 0.05901, + "grad_norm": 0.7082980339584151, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 5901 + }, + { + "epoch": 0.05902, + "grad_norm": 0.5794941356444853, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 5902 + }, + { + "epoch": 0.05903, + "grad_norm": 0.6148369105501363, + "learning_rate": 0.003, + "loss": 4.1347, + "step": 5903 + }, + { + "epoch": 0.05904, + "grad_norm": 0.49903767340028915, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 5904 + }, + { + "epoch": 0.05905, + "grad_norm": 0.5199067308874674, + "learning_rate": 0.003, + "loss": 4.1146, + "step": 5905 + }, + { + "epoch": 0.05906, + "grad_norm": 0.555958949304611, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 5906 + }, + { + "epoch": 0.05907, + "grad_norm": 0.6558790760046532, + "learning_rate": 0.003, + "loss": 4.108, + "step": 5907 + }, + { + "epoch": 0.05908, + "grad_norm": 0.7027699977426964, + "learning_rate": 0.003, + "loss": 4.1306, + "step": 5908 + }, + { + "epoch": 0.05909, + "grad_norm": 0.695163190283452, + "learning_rate": 0.003, + "loss": 4.1412, + "step": 5909 + }, + { + "epoch": 0.0591, + "grad_norm": 0.6843743312475742, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 5910 + }, + { + "epoch": 0.05911, + "grad_norm": 0.8025018895941953, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 5911 + }, + { + "epoch": 0.05912, + "grad_norm": 0.7607180090206141, + "learning_rate": 0.003, + "loss": 4.1517, + "step": 5912 + }, + { + "epoch": 0.05913, + "grad_norm": 0.6684077408320309, + "learning_rate": 0.003, + "loss": 4.1703, + "step": 5913 + }, + { + "epoch": 0.05914, + "grad_norm": 0.5315953517109148, + "learning_rate": 0.003, + "loss": 4.1367, + "step": 5914 + }, + { + "epoch": 0.05915, + "grad_norm": 0.5149999875273275, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 5915 + }, + { + "epoch": 0.05916, + "grad_norm": 0.5083952172925276, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 5916 + }, + { + "epoch": 0.05917, + "grad_norm": 0.6049621514969292, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 5917 + }, + { + "epoch": 0.05918, + "grad_norm": 0.6835808900733502, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 5918 + }, + { + "epoch": 0.05919, + "grad_norm": 0.6391964983836171, + "learning_rate": 0.003, + "loss": 4.1145, + "step": 5919 + }, + { + "epoch": 0.0592, + "grad_norm": 0.6143559484021399, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 5920 + }, + { + "epoch": 0.05921, + "grad_norm": 0.5979521195261379, + "learning_rate": 0.003, + "loss": 4.1331, + "step": 5921 + }, + { + "epoch": 0.05922, + "grad_norm": 0.5693281868266068, + "learning_rate": 0.003, + "loss": 4.1239, + "step": 5922 + }, + { + "epoch": 0.05923, + "grad_norm": 0.4849615069695424, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 5923 + }, + { + "epoch": 0.05924, + "grad_norm": 0.46280219229994835, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 5924 + }, + { + "epoch": 0.05925, + "grad_norm": 0.48491396846071155, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 5925 + }, + { + "epoch": 0.05926, + "grad_norm": 0.4958640158739645, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 5926 + }, + { + "epoch": 0.05927, + "grad_norm": 0.4751378170054991, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 5927 + }, + { + "epoch": 0.05928, + "grad_norm": 0.4782244181945609, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 5928 + }, + { + "epoch": 0.05929, + "grad_norm": 0.5398915288946496, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 5929 + }, + { + "epoch": 0.0593, + "grad_norm": 0.5515514495551406, + "learning_rate": 0.003, + "loss": 4.123, + "step": 5930 + }, + { + "epoch": 0.05931, + "grad_norm": 0.5294701484364319, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 5931 + }, + { + "epoch": 0.05932, + "grad_norm": 0.636944020483588, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 5932 + }, + { + "epoch": 0.05933, + "grad_norm": 0.7543213645563572, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 5933 + }, + { + "epoch": 0.05934, + "grad_norm": 0.866274898856157, + "learning_rate": 0.003, + "loss": 4.153, + "step": 5934 + }, + { + "epoch": 0.05935, + "grad_norm": 0.8843523390695053, + "learning_rate": 0.003, + "loss": 4.1379, + "step": 5935 + }, + { + "epoch": 0.05936, + "grad_norm": 0.8578932374136042, + "learning_rate": 0.003, + "loss": 4.1359, + "step": 5936 + }, + { + "epoch": 0.05937, + "grad_norm": 0.9791083661921728, + "learning_rate": 0.003, + "loss": 4.1452, + "step": 5937 + }, + { + "epoch": 0.05938, + "grad_norm": 1.091849551063209, + "learning_rate": 0.003, + "loss": 4.1333, + "step": 5938 + }, + { + "epoch": 0.05939, + "grad_norm": 0.8366778414432169, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 5939 + }, + { + "epoch": 0.0594, + "grad_norm": 0.8548416780890227, + "learning_rate": 0.003, + "loss": 4.1619, + "step": 5940 + }, + { + "epoch": 0.05941, + "grad_norm": 0.8185445127313269, + "learning_rate": 0.003, + "loss": 4.1571, + "step": 5941 + }, + { + "epoch": 0.05942, + "grad_norm": 0.7989954573855615, + "learning_rate": 0.003, + "loss": 4.151, + "step": 5942 + }, + { + "epoch": 0.05943, + "grad_norm": 0.6699215014203319, + "learning_rate": 0.003, + "loss": 4.1347, + "step": 5943 + }, + { + "epoch": 0.05944, + "grad_norm": 0.6793053946810763, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 5944 + }, + { + "epoch": 0.05945, + "grad_norm": 0.594609731292735, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 5945 + }, + { + "epoch": 0.05946, + "grad_norm": 0.6220089630527496, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 5946 + }, + { + "epoch": 0.05947, + "grad_norm": 0.6014765437282273, + "learning_rate": 0.003, + "loss": 4.118, + "step": 5947 + }, + { + "epoch": 0.05948, + "grad_norm": 0.5605950560039095, + "learning_rate": 0.003, + "loss": 4.1307, + "step": 5948 + }, + { + "epoch": 0.05949, + "grad_norm": 0.627701504083057, + "learning_rate": 0.003, + "loss": 4.121, + "step": 5949 + }, + { + "epoch": 0.0595, + "grad_norm": 0.825535661839418, + "learning_rate": 0.003, + "loss": 4.1392, + "step": 5950 + }, + { + "epoch": 0.05951, + "grad_norm": 1.016734954201298, + "learning_rate": 0.003, + "loss": 4.1371, + "step": 5951 + }, + { + "epoch": 0.05952, + "grad_norm": 0.9844508366684808, + "learning_rate": 0.003, + "loss": 4.1538, + "step": 5952 + }, + { + "epoch": 0.05953, + "grad_norm": 0.7676164320052801, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 5953 + }, + { + "epoch": 0.05954, + "grad_norm": 0.6525936684271212, + "learning_rate": 0.003, + "loss": 4.1454, + "step": 5954 + }, + { + "epoch": 0.05955, + "grad_norm": 0.6209927774101155, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 5955 + }, + { + "epoch": 0.05956, + "grad_norm": 0.5271299792696181, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 5956 + }, + { + "epoch": 0.05957, + "grad_norm": 0.4821600312349904, + "learning_rate": 0.003, + "loss": 4.108, + "step": 5957 + }, + { + "epoch": 0.05958, + "grad_norm": 0.4508961273097574, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 5958 + }, + { + "epoch": 0.05959, + "grad_norm": 0.39637824515281267, + "learning_rate": 0.003, + "loss": 4.1153, + "step": 5959 + }, + { + "epoch": 0.0596, + "grad_norm": 0.38874299684571356, + "learning_rate": 0.003, + "loss": 4.1307, + "step": 5960 + }, + { + "epoch": 0.05961, + "grad_norm": 0.3847016903534145, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 5961 + }, + { + "epoch": 0.05962, + "grad_norm": 0.5209058986494142, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 5962 + }, + { + "epoch": 0.05963, + "grad_norm": 0.6475184427595679, + "learning_rate": 0.003, + "loss": 4.1222, + "step": 5963 + }, + { + "epoch": 0.05964, + "grad_norm": 0.8684223155180772, + "learning_rate": 0.003, + "loss": 4.1237, + "step": 5964 + }, + { + "epoch": 0.05965, + "grad_norm": 0.9048617493837324, + "learning_rate": 0.003, + "loss": 4.1268, + "step": 5965 + }, + { + "epoch": 0.05966, + "grad_norm": 0.6837588042230079, + "learning_rate": 0.003, + "loss": 4.1349, + "step": 5966 + }, + { + "epoch": 0.05967, + "grad_norm": 0.7348291814277423, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 5967 + }, + { + "epoch": 0.05968, + "grad_norm": 0.8050791089461226, + "learning_rate": 0.003, + "loss": 4.1228, + "step": 5968 + }, + { + "epoch": 0.05969, + "grad_norm": 0.6869212130630161, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 5969 + }, + { + "epoch": 0.0597, + "grad_norm": 0.6307870695677188, + "learning_rate": 0.003, + "loss": 4.1122, + "step": 5970 + }, + { + "epoch": 0.05971, + "grad_norm": 0.5810377904286976, + "learning_rate": 0.003, + "loss": 4.1311, + "step": 5971 + }, + { + "epoch": 0.05972, + "grad_norm": 0.6021740778839361, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 5972 + }, + { + "epoch": 0.05973, + "grad_norm": 0.6228448900216191, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 5973 + }, + { + "epoch": 0.05974, + "grad_norm": 0.6655393691311872, + "learning_rate": 0.003, + "loss": 4.1, + "step": 5974 + }, + { + "epoch": 0.05975, + "grad_norm": 0.7310132470202124, + "learning_rate": 0.003, + "loss": 4.1415, + "step": 5975 + }, + { + "epoch": 0.05976, + "grad_norm": 0.644416431543167, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 5976 + }, + { + "epoch": 0.05977, + "grad_norm": 0.5496400386930255, + "learning_rate": 0.003, + "loss": 4.1279, + "step": 5977 + }, + { + "epoch": 0.05978, + "grad_norm": 0.5085905535105046, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 5978 + }, + { + "epoch": 0.05979, + "grad_norm": 0.4862541012980576, + "learning_rate": 0.003, + "loss": 4.1165, + "step": 5979 + }, + { + "epoch": 0.0598, + "grad_norm": 0.4826350144452677, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 5980 + }, + { + "epoch": 0.05981, + "grad_norm": 0.4741196340897285, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 5981 + }, + { + "epoch": 0.05982, + "grad_norm": 0.46060363145123095, + "learning_rate": 0.003, + "loss": 4.109, + "step": 5982 + }, + { + "epoch": 0.05983, + "grad_norm": 0.4820067683683615, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 5983 + }, + { + "epoch": 0.05984, + "grad_norm": 0.4691794530046038, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 5984 + }, + { + "epoch": 0.05985, + "grad_norm": 0.43934894841255706, + "learning_rate": 0.003, + "loss": 4.1228, + "step": 5985 + }, + { + "epoch": 0.05986, + "grad_norm": 0.47930129072684313, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 5986 + }, + { + "epoch": 0.05987, + "grad_norm": 0.5100592025533284, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 5987 + }, + { + "epoch": 0.05988, + "grad_norm": 0.4911602197245711, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 5988 + }, + { + "epoch": 0.05989, + "grad_norm": 0.537756575806154, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 5989 + }, + { + "epoch": 0.0599, + "grad_norm": 0.6185246945024333, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 5990 + }, + { + "epoch": 0.05991, + "grad_norm": 0.6911938315101449, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 5991 + }, + { + "epoch": 0.05992, + "grad_norm": 0.7869203480784628, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 5992 + }, + { + "epoch": 0.05993, + "grad_norm": 0.8486504973239845, + "learning_rate": 0.003, + "loss": 4.1285, + "step": 5993 + }, + { + "epoch": 0.05994, + "grad_norm": 0.7035449677364836, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 5994 + }, + { + "epoch": 0.05995, + "grad_norm": 0.6451536905550219, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 5995 + }, + { + "epoch": 0.05996, + "grad_norm": 0.5969532336692085, + "learning_rate": 0.003, + "loss": 4.1433, + "step": 5996 + }, + { + "epoch": 0.05997, + "grad_norm": 0.5888895798721049, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 5997 + }, + { + "epoch": 0.05998, + "grad_norm": 0.5821959514538432, + "learning_rate": 0.003, + "loss": 4.138, + "step": 5998 + }, + { + "epoch": 0.05999, + "grad_norm": 0.6236789871480475, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 5999 + }, + { + "epoch": 0.06, + "grad_norm": 0.6956596076788343, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 6000 + }, + { + "epoch": 0.06001, + "grad_norm": 0.6802540034736559, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 6001 + }, + { + "epoch": 0.06002, + "grad_norm": 0.6479478065752834, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 6002 + }, + { + "epoch": 0.06003, + "grad_norm": 0.6764473408981168, + "learning_rate": 0.003, + "loss": 4.1254, + "step": 6003 + }, + { + "epoch": 0.06004, + "grad_norm": 0.7085851864172142, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 6004 + }, + { + "epoch": 0.06005, + "grad_norm": 0.8647819122756384, + "learning_rate": 0.003, + "loss": 4.1269, + "step": 6005 + }, + { + "epoch": 0.06006, + "grad_norm": 0.9638548780117764, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 6006 + }, + { + "epoch": 0.06007, + "grad_norm": 0.8220903209106395, + "learning_rate": 0.003, + "loss": 4.1595, + "step": 6007 + }, + { + "epoch": 0.06008, + "grad_norm": 0.7201290584455559, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 6008 + }, + { + "epoch": 0.06009, + "grad_norm": 0.6481918637846485, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 6009 + }, + { + "epoch": 0.0601, + "grad_norm": 0.6606383295563236, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 6010 + }, + { + "epoch": 0.06011, + "grad_norm": 0.6045842842765333, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 6011 + }, + { + "epoch": 0.06012, + "grad_norm": 0.5311879872224512, + "learning_rate": 0.003, + "loss": 4.1304, + "step": 6012 + }, + { + "epoch": 0.06013, + "grad_norm": 0.5661934584861681, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 6013 + }, + { + "epoch": 0.06014, + "grad_norm": 0.5103102427525318, + "learning_rate": 0.003, + "loss": 4.1094, + "step": 6014 + }, + { + "epoch": 0.06015, + "grad_norm": 0.5332466894899932, + "learning_rate": 0.003, + "loss": 4.1251, + "step": 6015 + }, + { + "epoch": 0.06016, + "grad_norm": 0.5814846632342617, + "learning_rate": 0.003, + "loss": 4.128, + "step": 6016 + }, + { + "epoch": 0.06017, + "grad_norm": 0.5584668704349169, + "learning_rate": 0.003, + "loss": 4.131, + "step": 6017 + }, + { + "epoch": 0.06018, + "grad_norm": 0.6487569147119433, + "learning_rate": 0.003, + "loss": 4.1498, + "step": 6018 + }, + { + "epoch": 0.06019, + "grad_norm": 0.8160505827483635, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 6019 + }, + { + "epoch": 0.0602, + "grad_norm": 0.9505286189725469, + "learning_rate": 0.003, + "loss": 4.1265, + "step": 6020 + }, + { + "epoch": 0.06021, + "grad_norm": 0.9008202254968174, + "learning_rate": 0.003, + "loss": 4.1474, + "step": 6021 + }, + { + "epoch": 0.06022, + "grad_norm": 0.8318378400927622, + "learning_rate": 0.003, + "loss": 4.1488, + "step": 6022 + }, + { + "epoch": 0.06023, + "grad_norm": 0.8535039615699999, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 6023 + }, + { + "epoch": 0.06024, + "grad_norm": 0.7266061447147755, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 6024 + }, + { + "epoch": 0.06025, + "grad_norm": 0.6534591173507039, + "learning_rate": 0.003, + "loss": 4.167, + "step": 6025 + }, + { + "epoch": 0.06026, + "grad_norm": 0.6882074560691404, + "learning_rate": 0.003, + "loss": 4.1332, + "step": 6026 + }, + { + "epoch": 0.06027, + "grad_norm": 0.6152833498862915, + "learning_rate": 0.003, + "loss": 4.1254, + "step": 6027 + }, + { + "epoch": 0.06028, + "grad_norm": 0.6917892126475053, + "learning_rate": 0.003, + "loss": 4.1526, + "step": 6028 + }, + { + "epoch": 0.06029, + "grad_norm": 0.7744787029514602, + "learning_rate": 0.003, + "loss": 4.1476, + "step": 6029 + }, + { + "epoch": 0.0603, + "grad_norm": 0.7298002661604193, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 6030 + }, + { + "epoch": 0.06031, + "grad_norm": 0.6529802283746298, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 6031 + }, + { + "epoch": 0.06032, + "grad_norm": 0.663361567248902, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 6032 + }, + { + "epoch": 0.06033, + "grad_norm": 0.6680068753946955, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 6033 + }, + { + "epoch": 0.06034, + "grad_norm": 0.5886111324153055, + "learning_rate": 0.003, + "loss": 4.1497, + "step": 6034 + }, + { + "epoch": 0.06035, + "grad_norm": 0.5817861329776778, + "learning_rate": 0.003, + "loss": 4.14, + "step": 6035 + }, + { + "epoch": 0.06036, + "grad_norm": 0.5937836039304831, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 6036 + }, + { + "epoch": 0.06037, + "grad_norm": 0.5457734592349858, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 6037 + }, + { + "epoch": 0.06038, + "grad_norm": 0.5606694744948917, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 6038 + }, + { + "epoch": 0.06039, + "grad_norm": 0.5636222880096219, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 6039 + }, + { + "epoch": 0.0604, + "grad_norm": 0.499349110228407, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 6040 + }, + { + "epoch": 0.06041, + "grad_norm": 0.4402271663603966, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 6041 + }, + { + "epoch": 0.06042, + "grad_norm": 0.5428448624210946, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 6042 + }, + { + "epoch": 0.06043, + "grad_norm": 0.6553234947154927, + "learning_rate": 0.003, + "loss": 4.1162, + "step": 6043 + }, + { + "epoch": 0.06044, + "grad_norm": 0.8600471908171244, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 6044 + }, + { + "epoch": 0.06045, + "grad_norm": 0.872706540571187, + "learning_rate": 0.003, + "loss": 4.1297, + "step": 6045 + }, + { + "epoch": 0.06046, + "grad_norm": 0.821018845139322, + "learning_rate": 0.003, + "loss": 4.1353, + "step": 6046 + }, + { + "epoch": 0.06047, + "grad_norm": 0.9233882633484479, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 6047 + }, + { + "epoch": 0.06048, + "grad_norm": 0.8442957052670749, + "learning_rate": 0.003, + "loss": 4.117, + "step": 6048 + }, + { + "epoch": 0.06049, + "grad_norm": 0.7302305536562084, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 6049 + }, + { + "epoch": 0.0605, + "grad_norm": 0.7221886690004308, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 6050 + }, + { + "epoch": 0.06051, + "grad_norm": 0.7360382510474752, + "learning_rate": 0.003, + "loss": 4.1511, + "step": 6051 + }, + { + "epoch": 0.06052, + "grad_norm": 0.8032114038858249, + "learning_rate": 0.003, + "loss": 4.1494, + "step": 6052 + }, + { + "epoch": 0.06053, + "grad_norm": 0.902493419850475, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 6053 + }, + { + "epoch": 0.06054, + "grad_norm": 0.806885544934829, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 6054 + }, + { + "epoch": 0.06055, + "grad_norm": 0.6869862769047108, + "learning_rate": 0.003, + "loss": 4.1325, + "step": 6055 + }, + { + "epoch": 0.06056, + "grad_norm": 0.6404057857514671, + "learning_rate": 0.003, + "loss": 4.1271, + "step": 6056 + }, + { + "epoch": 0.06057, + "grad_norm": 0.6290861083377466, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 6057 + }, + { + "epoch": 0.06058, + "grad_norm": 0.5444350086738762, + "learning_rate": 0.003, + "loss": 4.1188, + "step": 6058 + }, + { + "epoch": 0.06059, + "grad_norm": 0.5143246073617321, + "learning_rate": 0.003, + "loss": 4.13, + "step": 6059 + }, + { + "epoch": 0.0606, + "grad_norm": 0.5226193521387048, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 6060 + }, + { + "epoch": 0.06061, + "grad_norm": 0.5393493288965573, + "learning_rate": 0.003, + "loss": 4.1323, + "step": 6061 + }, + { + "epoch": 0.06062, + "grad_norm": 0.514502958864031, + "learning_rate": 0.003, + "loss": 4.1358, + "step": 6062 + }, + { + "epoch": 0.06063, + "grad_norm": 0.5094815444270113, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 6063 + }, + { + "epoch": 0.06064, + "grad_norm": 0.565356316744902, + "learning_rate": 0.003, + "loss": 4.1496, + "step": 6064 + }, + { + "epoch": 0.06065, + "grad_norm": 0.6136518813857853, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 6065 + }, + { + "epoch": 0.06066, + "grad_norm": 0.7283737698010649, + "learning_rate": 0.003, + "loss": 4.1534, + "step": 6066 + }, + { + "epoch": 0.06067, + "grad_norm": 0.8404355722803185, + "learning_rate": 0.003, + "loss": 4.1342, + "step": 6067 + }, + { + "epoch": 0.06068, + "grad_norm": 0.9467403298027881, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 6068 + }, + { + "epoch": 0.06069, + "grad_norm": 0.7197917961662677, + "learning_rate": 0.003, + "loss": 4.1299, + "step": 6069 + }, + { + "epoch": 0.0607, + "grad_norm": 0.5774704901538709, + "learning_rate": 0.003, + "loss": 4.1259, + "step": 6070 + }, + { + "epoch": 0.06071, + "grad_norm": 0.7311169673433349, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 6071 + }, + { + "epoch": 0.06072, + "grad_norm": 0.7458715679339405, + "learning_rate": 0.003, + "loss": 4.1131, + "step": 6072 + }, + { + "epoch": 0.06073, + "grad_norm": 0.6582937125157123, + "learning_rate": 0.003, + "loss": 4.1375, + "step": 6073 + }, + { + "epoch": 0.06074, + "grad_norm": 0.5828453881664645, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 6074 + }, + { + "epoch": 0.06075, + "grad_norm": 0.6029835041926357, + "learning_rate": 0.003, + "loss": 4.1402, + "step": 6075 + }, + { + "epoch": 0.06076, + "grad_norm": 0.6620595916061112, + "learning_rate": 0.003, + "loss": 4.1266, + "step": 6076 + }, + { + "epoch": 0.06077, + "grad_norm": 0.6564194015584814, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 6077 + }, + { + "epoch": 0.06078, + "grad_norm": 0.5186407499376411, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 6078 + }, + { + "epoch": 0.06079, + "grad_norm": 0.46056676475765035, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 6079 + }, + { + "epoch": 0.0608, + "grad_norm": 0.47846889118582303, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 6080 + }, + { + "epoch": 0.06081, + "grad_norm": 0.5041919175905508, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 6081 + }, + { + "epoch": 0.06082, + "grad_norm": 0.46581276372898556, + "learning_rate": 0.003, + "loss": 4.1401, + "step": 6082 + }, + { + "epoch": 0.06083, + "grad_norm": 0.44508131506221504, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 6083 + }, + { + "epoch": 0.06084, + "grad_norm": 0.4965899394145137, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 6084 + }, + { + "epoch": 0.06085, + "grad_norm": 0.54325827930552, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 6085 + }, + { + "epoch": 0.06086, + "grad_norm": 0.566296273115109, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 6086 + }, + { + "epoch": 0.06087, + "grad_norm": 0.6264732981988343, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 6087 + }, + { + "epoch": 0.06088, + "grad_norm": 0.7267394373677858, + "learning_rate": 0.003, + "loss": 4.1063, + "step": 6088 + }, + { + "epoch": 0.06089, + "grad_norm": 0.6910375877495948, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 6089 + }, + { + "epoch": 0.0609, + "grad_norm": 0.6230953516324953, + "learning_rate": 0.003, + "loss": 4.1421, + "step": 6090 + }, + { + "epoch": 0.06091, + "grad_norm": 0.6214986267493964, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 6091 + }, + { + "epoch": 0.06092, + "grad_norm": 0.5881606444052708, + "learning_rate": 0.003, + "loss": 4.096, + "step": 6092 + }, + { + "epoch": 0.06093, + "grad_norm": 0.6823668073031264, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 6093 + }, + { + "epoch": 0.06094, + "grad_norm": 0.8167174518561652, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 6094 + }, + { + "epoch": 0.06095, + "grad_norm": 0.7935802530659607, + "learning_rate": 0.003, + "loss": 4.126, + "step": 6095 + }, + { + "epoch": 0.06096, + "grad_norm": 0.8625378261886631, + "learning_rate": 0.003, + "loss": 4.1198, + "step": 6096 + }, + { + "epoch": 0.06097, + "grad_norm": 0.8936840972481503, + "learning_rate": 0.003, + "loss": 4.1399, + "step": 6097 + }, + { + "epoch": 0.06098, + "grad_norm": 0.9387438614632022, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 6098 + }, + { + "epoch": 0.06099, + "grad_norm": 0.929800973675657, + "learning_rate": 0.003, + "loss": 4.1427, + "step": 6099 + }, + { + "epoch": 0.061, + "grad_norm": 0.7917197466465429, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 6100 + }, + { + "epoch": 0.06101, + "grad_norm": 0.696545220226268, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 6101 + }, + { + "epoch": 0.06102, + "grad_norm": 0.6949648238824377, + "learning_rate": 0.003, + "loss": 4.1406, + "step": 6102 + }, + { + "epoch": 0.06103, + "grad_norm": 0.5850787846680839, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 6103 + }, + { + "epoch": 0.06104, + "grad_norm": 0.6577354660593795, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 6104 + }, + { + "epoch": 0.06105, + "grad_norm": 0.5544659377310093, + "learning_rate": 0.003, + "loss": 4.1189, + "step": 6105 + }, + { + "epoch": 0.06106, + "grad_norm": 0.597694523433053, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 6106 + }, + { + "epoch": 0.06107, + "grad_norm": 0.6874240958175389, + "learning_rate": 0.003, + "loss": 4.1541, + "step": 6107 + }, + { + "epoch": 0.06108, + "grad_norm": 0.7673053125459535, + "learning_rate": 0.003, + "loss": 4.1247, + "step": 6108 + }, + { + "epoch": 0.06109, + "grad_norm": 0.8657897732774222, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 6109 + }, + { + "epoch": 0.0611, + "grad_norm": 0.8160350383434505, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 6110 + }, + { + "epoch": 0.06111, + "grad_norm": 0.7059937113818745, + "learning_rate": 0.003, + "loss": 4.1379, + "step": 6111 + }, + { + "epoch": 0.06112, + "grad_norm": 0.6729024659212177, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 6112 + }, + { + "epoch": 0.06113, + "grad_norm": 0.6927676866964618, + "learning_rate": 0.003, + "loss": 4.1398, + "step": 6113 + }, + { + "epoch": 0.06114, + "grad_norm": 0.6588192987110121, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 6114 + }, + { + "epoch": 0.06115, + "grad_norm": 0.6984192814248, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 6115 + }, + { + "epoch": 0.06116, + "grad_norm": 0.6898828248533009, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 6116 + }, + { + "epoch": 0.06117, + "grad_norm": 0.6133649995686336, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 6117 + }, + { + "epoch": 0.06118, + "grad_norm": 0.5717021418949526, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 6118 + }, + { + "epoch": 0.06119, + "grad_norm": 0.5743733423784956, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 6119 + }, + { + "epoch": 0.0612, + "grad_norm": 0.6414842059318022, + "learning_rate": 0.003, + "loss": 4.1282, + "step": 6120 + }, + { + "epoch": 0.06121, + "grad_norm": 0.6294145948404021, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 6121 + }, + { + "epoch": 0.06122, + "grad_norm": 0.6183422089928163, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 6122 + }, + { + "epoch": 0.06123, + "grad_norm": 0.5371837518354552, + "learning_rate": 0.003, + "loss": 4.1223, + "step": 6123 + }, + { + "epoch": 0.06124, + "grad_norm": 0.5226763311000785, + "learning_rate": 0.003, + "loss": 4.113, + "step": 6124 + }, + { + "epoch": 0.06125, + "grad_norm": 0.42639897195355503, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 6125 + }, + { + "epoch": 0.06126, + "grad_norm": 0.3629196367260766, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 6126 + }, + { + "epoch": 0.06127, + "grad_norm": 0.368265832283412, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 6127 + }, + { + "epoch": 0.06128, + "grad_norm": 0.38181801243432145, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 6128 + }, + { + "epoch": 0.06129, + "grad_norm": 0.40095129455194956, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 6129 + }, + { + "epoch": 0.0613, + "grad_norm": 0.4499458135634559, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 6130 + }, + { + "epoch": 0.06131, + "grad_norm": 0.5593938292242131, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 6131 + }, + { + "epoch": 0.06132, + "grad_norm": 0.68971097830543, + "learning_rate": 0.003, + "loss": 4.1169, + "step": 6132 + }, + { + "epoch": 0.06133, + "grad_norm": 0.7970564003592729, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 6133 + }, + { + "epoch": 0.06134, + "grad_norm": 0.7946308123903568, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 6134 + }, + { + "epoch": 0.06135, + "grad_norm": 0.7243365229476787, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 6135 + }, + { + "epoch": 0.06136, + "grad_norm": 0.7248377814506896, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 6136 + }, + { + "epoch": 0.06137, + "grad_norm": 0.689387348680236, + "learning_rate": 0.003, + "loss": 4.1119, + "step": 6137 + }, + { + "epoch": 0.06138, + "grad_norm": 0.6434616078916127, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 6138 + }, + { + "epoch": 0.06139, + "grad_norm": 0.6388715462619938, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 6139 + }, + { + "epoch": 0.0614, + "grad_norm": 0.5543186971635615, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 6140 + }, + { + "epoch": 0.06141, + "grad_norm": 0.5449818716272995, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 6141 + }, + { + "epoch": 0.06142, + "grad_norm": 0.5906892436672208, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 6142 + }, + { + "epoch": 0.06143, + "grad_norm": 0.6241199656978966, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 6143 + }, + { + "epoch": 0.06144, + "grad_norm": 0.691568154043788, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 6144 + }, + { + "epoch": 0.06145, + "grad_norm": 0.800383551611604, + "learning_rate": 0.003, + "loss": 4.1306, + "step": 6145 + }, + { + "epoch": 0.06146, + "grad_norm": 0.8240824904436047, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 6146 + }, + { + "epoch": 0.06147, + "grad_norm": 0.8394359495307258, + "learning_rate": 0.003, + "loss": 4.1555, + "step": 6147 + }, + { + "epoch": 0.06148, + "grad_norm": 0.8704187123192046, + "learning_rate": 0.003, + "loss": 4.1313, + "step": 6148 + }, + { + "epoch": 0.06149, + "grad_norm": 0.8286112000168357, + "learning_rate": 0.003, + "loss": 4.123, + "step": 6149 + }, + { + "epoch": 0.0615, + "grad_norm": 0.7291708358830001, + "learning_rate": 0.003, + "loss": 4.1369, + "step": 6150 + }, + { + "epoch": 0.06151, + "grad_norm": 0.6713497435304215, + "learning_rate": 0.003, + "loss": 4.1292, + "step": 6151 + }, + { + "epoch": 0.06152, + "grad_norm": 0.6781308025527774, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 6152 + }, + { + "epoch": 0.06153, + "grad_norm": 0.652938479870769, + "learning_rate": 0.003, + "loss": 4.1395, + "step": 6153 + }, + { + "epoch": 0.06154, + "grad_norm": 0.6842620061629228, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 6154 + }, + { + "epoch": 0.06155, + "grad_norm": 0.6696388139082615, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 6155 + }, + { + "epoch": 0.06156, + "grad_norm": 0.661871708727772, + "learning_rate": 0.003, + "loss": 4.1161, + "step": 6156 + }, + { + "epoch": 0.06157, + "grad_norm": 0.6706443949736604, + "learning_rate": 0.003, + "loss": 4.1333, + "step": 6157 + }, + { + "epoch": 0.06158, + "grad_norm": 0.6730091441682898, + "learning_rate": 0.003, + "loss": 4.1581, + "step": 6158 + }, + { + "epoch": 0.06159, + "grad_norm": 0.6516318216884822, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 6159 + }, + { + "epoch": 0.0616, + "grad_norm": 0.6776433090637509, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 6160 + }, + { + "epoch": 0.06161, + "grad_norm": 0.705245630542708, + "learning_rate": 0.003, + "loss": 4.1387, + "step": 6161 + }, + { + "epoch": 0.06162, + "grad_norm": 0.667033646386603, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 6162 + }, + { + "epoch": 0.06163, + "grad_norm": 0.6115409987155483, + "learning_rate": 0.003, + "loss": 4.1525, + "step": 6163 + }, + { + "epoch": 0.06164, + "grad_norm": 0.6022989592102578, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 6164 + }, + { + "epoch": 0.06165, + "grad_norm": 0.4995381931517796, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 6165 + }, + { + "epoch": 0.06166, + "grad_norm": 0.5481265388426909, + "learning_rate": 0.003, + "loss": 4.1362, + "step": 6166 + }, + { + "epoch": 0.06167, + "grad_norm": 0.5609608519278377, + "learning_rate": 0.003, + "loss": 4.107, + "step": 6167 + }, + { + "epoch": 0.06168, + "grad_norm": 0.5825751863249862, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 6168 + }, + { + "epoch": 0.06169, + "grad_norm": 0.7711393078072618, + "learning_rate": 0.003, + "loss": 4.1164, + "step": 6169 + }, + { + "epoch": 0.0617, + "grad_norm": 1.0686022680290213, + "learning_rate": 0.003, + "loss": 4.1327, + "step": 6170 + }, + { + "epoch": 0.06171, + "grad_norm": 0.8988455270603705, + "learning_rate": 0.003, + "loss": 4.1401, + "step": 6171 + }, + { + "epoch": 0.06172, + "grad_norm": 0.668550526380113, + "learning_rate": 0.003, + "loss": 4.1053, + "step": 6172 + }, + { + "epoch": 0.06173, + "grad_norm": 0.6432161964405613, + "learning_rate": 0.003, + "loss": 4.1344, + "step": 6173 + }, + { + "epoch": 0.06174, + "grad_norm": 0.6965904305758828, + "learning_rate": 0.003, + "loss": 4.1298, + "step": 6174 + }, + { + "epoch": 0.06175, + "grad_norm": 0.6884542211695148, + "learning_rate": 0.003, + "loss": 4.1298, + "step": 6175 + }, + { + "epoch": 0.06176, + "grad_norm": 0.722156592407858, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 6176 + }, + { + "epoch": 0.06177, + "grad_norm": 0.7406001109119403, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 6177 + }, + { + "epoch": 0.06178, + "grad_norm": 0.7298011712863863, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 6178 + }, + { + "epoch": 0.06179, + "grad_norm": 0.7122823712875358, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 6179 + }, + { + "epoch": 0.0618, + "grad_norm": 0.5964322698799788, + "learning_rate": 0.003, + "loss": 4.123, + "step": 6180 + }, + { + "epoch": 0.06181, + "grad_norm": 0.581223696651277, + "learning_rate": 0.003, + "loss": 4.1402, + "step": 6181 + }, + { + "epoch": 0.06182, + "grad_norm": 0.6517698358193322, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 6182 + }, + { + "epoch": 0.06183, + "grad_norm": 0.82973155998321, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 6183 + }, + { + "epoch": 0.06184, + "grad_norm": 1.0291494249526294, + "learning_rate": 0.003, + "loss": 4.1363, + "step": 6184 + }, + { + "epoch": 0.06185, + "grad_norm": 0.838098174913646, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 6185 + }, + { + "epoch": 0.06186, + "grad_norm": 0.7438011498693125, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 6186 + }, + { + "epoch": 0.06187, + "grad_norm": 0.6718856898807979, + "learning_rate": 0.003, + "loss": 4.149, + "step": 6187 + }, + { + "epoch": 0.06188, + "grad_norm": 0.5340636370837376, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 6188 + }, + { + "epoch": 0.06189, + "grad_norm": 0.5159651984761162, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 6189 + }, + { + "epoch": 0.0619, + "grad_norm": 0.4916863941661442, + "learning_rate": 0.003, + "loss": 4.1512, + "step": 6190 + }, + { + "epoch": 0.06191, + "grad_norm": 0.4762063974718235, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 6191 + }, + { + "epoch": 0.06192, + "grad_norm": 0.46078596463013155, + "learning_rate": 0.003, + "loss": 4.103, + "step": 6192 + }, + { + "epoch": 0.06193, + "grad_norm": 0.45789803674311924, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 6193 + }, + { + "epoch": 0.06194, + "grad_norm": 0.3890805745658892, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 6194 + }, + { + "epoch": 0.06195, + "grad_norm": 0.39301414143303115, + "learning_rate": 0.003, + "loss": 4.1095, + "step": 6195 + }, + { + "epoch": 0.06196, + "grad_norm": 0.4015529007182125, + "learning_rate": 0.003, + "loss": 4.1204, + "step": 6196 + }, + { + "epoch": 0.06197, + "grad_norm": 0.4155210473439802, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 6197 + }, + { + "epoch": 0.06198, + "grad_norm": 0.4460266614970883, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 6198 + }, + { + "epoch": 0.06199, + "grad_norm": 0.4771646815553033, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 6199 + }, + { + "epoch": 0.062, + "grad_norm": 0.5196589692573884, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 6200 + }, + { + "epoch": 0.06201, + "grad_norm": 0.654622055901127, + "learning_rate": 0.003, + "loss": 4.097, + "step": 6201 + }, + { + "epoch": 0.06202, + "grad_norm": 0.7996732523950479, + "learning_rate": 0.003, + "loss": 4.1475, + "step": 6202 + }, + { + "epoch": 0.06203, + "grad_norm": 0.9238177375399548, + "learning_rate": 0.003, + "loss": 4.1397, + "step": 6203 + }, + { + "epoch": 0.06204, + "grad_norm": 0.8129041862310228, + "learning_rate": 0.003, + "loss": 4.131, + "step": 6204 + }, + { + "epoch": 0.06205, + "grad_norm": 0.6014391143548676, + "learning_rate": 0.003, + "loss": 4.1265, + "step": 6205 + }, + { + "epoch": 0.06206, + "grad_norm": 0.6512606966939022, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 6206 + }, + { + "epoch": 0.06207, + "grad_norm": 0.6313513041064389, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 6207 + }, + { + "epoch": 0.06208, + "grad_norm": 0.5802624432054514, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 6208 + }, + { + "epoch": 0.06209, + "grad_norm": 0.5117728936425465, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 6209 + }, + { + "epoch": 0.0621, + "grad_norm": 0.5061612945373342, + "learning_rate": 0.003, + "loss": 4.15, + "step": 6210 + }, + { + "epoch": 0.06211, + "grad_norm": 0.5713900960332042, + "learning_rate": 0.003, + "loss": 4.1147, + "step": 6211 + }, + { + "epoch": 0.06212, + "grad_norm": 0.6069757619115741, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 6212 + }, + { + "epoch": 0.06213, + "grad_norm": 0.7325742487754486, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 6213 + }, + { + "epoch": 0.06214, + "grad_norm": 0.9327829534552855, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 6214 + }, + { + "epoch": 0.06215, + "grad_norm": 1.035143571261494, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 6215 + }, + { + "epoch": 0.06216, + "grad_norm": 0.9038892809710738, + "learning_rate": 0.003, + "loss": 4.1712, + "step": 6216 + }, + { + "epoch": 0.06217, + "grad_norm": 0.8175109198583256, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 6217 + }, + { + "epoch": 0.06218, + "grad_norm": 0.8799511997661614, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 6218 + }, + { + "epoch": 0.06219, + "grad_norm": 0.9186492040029288, + "learning_rate": 0.003, + "loss": 4.1569, + "step": 6219 + }, + { + "epoch": 0.0622, + "grad_norm": 0.904845342853041, + "learning_rate": 0.003, + "loss": 4.1554, + "step": 6220 + }, + { + "epoch": 0.06221, + "grad_norm": 0.8348080339462427, + "learning_rate": 0.003, + "loss": 4.1528, + "step": 6221 + }, + { + "epoch": 0.06222, + "grad_norm": 0.7823721322877822, + "learning_rate": 0.003, + "loss": 4.1371, + "step": 6222 + }, + { + "epoch": 0.06223, + "grad_norm": 0.8355859013287794, + "learning_rate": 0.003, + "loss": 4.1484, + "step": 6223 + }, + { + "epoch": 0.06224, + "grad_norm": 0.9715063855754449, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 6224 + }, + { + "epoch": 0.06225, + "grad_norm": 0.7808199034581652, + "learning_rate": 0.003, + "loss": 4.1509, + "step": 6225 + }, + { + "epoch": 0.06226, + "grad_norm": 0.5329204522038972, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 6226 + }, + { + "epoch": 0.06227, + "grad_norm": 0.5450265158322551, + "learning_rate": 0.003, + "loss": 4.1314, + "step": 6227 + }, + { + "epoch": 0.06228, + "grad_norm": 0.5351380607743084, + "learning_rate": 0.003, + "loss": 4.1347, + "step": 6228 + }, + { + "epoch": 0.06229, + "grad_norm": 0.4599058234618347, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 6229 + }, + { + "epoch": 0.0623, + "grad_norm": 0.4873390679451465, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 6230 + }, + { + "epoch": 0.06231, + "grad_norm": 0.5391956252106268, + "learning_rate": 0.003, + "loss": 4.1151, + "step": 6231 + }, + { + "epoch": 0.06232, + "grad_norm": 0.5342591833135322, + "learning_rate": 0.003, + "loss": 4.11, + "step": 6232 + }, + { + "epoch": 0.06233, + "grad_norm": 0.5191689316659253, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 6233 + }, + { + "epoch": 0.06234, + "grad_norm": 0.5396807325147936, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 6234 + }, + { + "epoch": 0.06235, + "grad_norm": 0.5913027145203329, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 6235 + }, + { + "epoch": 0.06236, + "grad_norm": 0.5979504829726415, + "learning_rate": 0.003, + "loss": 4.126, + "step": 6236 + }, + { + "epoch": 0.06237, + "grad_norm": 0.5052384980096988, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 6237 + }, + { + "epoch": 0.06238, + "grad_norm": 0.6128825413246882, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 6238 + }, + { + "epoch": 0.06239, + "grad_norm": 0.7235535837216264, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 6239 + }, + { + "epoch": 0.0624, + "grad_norm": 0.7240803076746741, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 6240 + }, + { + "epoch": 0.06241, + "grad_norm": 0.6862973733458866, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 6241 + }, + { + "epoch": 0.06242, + "grad_norm": 0.7830627984578179, + "learning_rate": 0.003, + "loss": 4.1254, + "step": 6242 + }, + { + "epoch": 0.06243, + "grad_norm": 0.7591018516451389, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 6243 + }, + { + "epoch": 0.06244, + "grad_norm": 0.7952190700820353, + "learning_rate": 0.003, + "loss": 4.1517, + "step": 6244 + }, + { + "epoch": 0.06245, + "grad_norm": 0.6835256460654824, + "learning_rate": 0.003, + "loss": 4.1234, + "step": 6245 + }, + { + "epoch": 0.06246, + "grad_norm": 0.6543359618448353, + "learning_rate": 0.003, + "loss": 4.1372, + "step": 6246 + }, + { + "epoch": 0.06247, + "grad_norm": 0.5419452686920828, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 6247 + }, + { + "epoch": 0.06248, + "grad_norm": 0.4806235291826554, + "learning_rate": 0.003, + "loss": 4.114, + "step": 6248 + }, + { + "epoch": 0.06249, + "grad_norm": 0.453910776396139, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 6249 + }, + { + "epoch": 0.0625, + "grad_norm": 0.491783945365717, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 6250 + }, + { + "epoch": 0.06251, + "grad_norm": 0.5018941564042159, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 6251 + }, + { + "epoch": 0.06252, + "grad_norm": 0.4921472089644119, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 6252 + }, + { + "epoch": 0.06253, + "grad_norm": 0.5756543753609307, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 6253 + }, + { + "epoch": 0.06254, + "grad_norm": 0.754887950867572, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 6254 + }, + { + "epoch": 0.06255, + "grad_norm": 0.8752296029209932, + "learning_rate": 0.003, + "loss": 4.1282, + "step": 6255 + }, + { + "epoch": 0.06256, + "grad_norm": 0.9844123215090611, + "learning_rate": 0.003, + "loss": 4.1095, + "step": 6256 + }, + { + "epoch": 0.06257, + "grad_norm": 0.8214120124061037, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 6257 + }, + { + "epoch": 0.06258, + "grad_norm": 0.7602595490679227, + "learning_rate": 0.003, + "loss": 4.141, + "step": 6258 + }, + { + "epoch": 0.06259, + "grad_norm": 0.9107431312102489, + "learning_rate": 0.003, + "loss": 4.1605, + "step": 6259 + }, + { + "epoch": 0.0626, + "grad_norm": 0.8325776100808221, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 6260 + }, + { + "epoch": 0.06261, + "grad_norm": 0.7571831266663098, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 6261 + }, + { + "epoch": 0.06262, + "grad_norm": 0.8785267861570065, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 6262 + }, + { + "epoch": 0.06263, + "grad_norm": 0.8761411863659299, + "learning_rate": 0.003, + "loss": 4.1576, + "step": 6263 + }, + { + "epoch": 0.06264, + "grad_norm": 0.7407405897110528, + "learning_rate": 0.003, + "loss": 4.1281, + "step": 6264 + }, + { + "epoch": 0.06265, + "grad_norm": 0.5999131309688215, + "learning_rate": 0.003, + "loss": 4.1401, + "step": 6265 + }, + { + "epoch": 0.06266, + "grad_norm": 0.5788028949497352, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 6266 + }, + { + "epoch": 0.06267, + "grad_norm": 0.5413831011099959, + "learning_rate": 0.003, + "loss": 4.127, + "step": 6267 + }, + { + "epoch": 0.06268, + "grad_norm": 0.5311326877936203, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 6268 + }, + { + "epoch": 0.06269, + "grad_norm": 0.46005639814155846, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 6269 + }, + { + "epoch": 0.0627, + "grad_norm": 0.4377136995334488, + "learning_rate": 0.003, + "loss": 4.1122, + "step": 6270 + }, + { + "epoch": 0.06271, + "grad_norm": 0.45104310998898184, + "learning_rate": 0.003, + "loss": 4.1253, + "step": 6271 + }, + { + "epoch": 0.06272, + "grad_norm": 0.5739468267833442, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 6272 + }, + { + "epoch": 0.06273, + "grad_norm": 0.6519100999767281, + "learning_rate": 0.003, + "loss": 4.1378, + "step": 6273 + }, + { + "epoch": 0.06274, + "grad_norm": 0.7586632185001425, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 6274 + }, + { + "epoch": 0.06275, + "grad_norm": 0.8032215370564139, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 6275 + }, + { + "epoch": 0.06276, + "grad_norm": 0.808677786213581, + "learning_rate": 0.003, + "loss": 4.1316, + "step": 6276 + }, + { + "epoch": 0.06277, + "grad_norm": 0.7705443100288017, + "learning_rate": 0.003, + "loss": 4.1371, + "step": 6277 + }, + { + "epoch": 0.06278, + "grad_norm": 0.6723307868789453, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 6278 + }, + { + "epoch": 0.06279, + "grad_norm": 0.650139228463092, + "learning_rate": 0.003, + "loss": 4.1425, + "step": 6279 + }, + { + "epoch": 0.0628, + "grad_norm": 0.6941961358979846, + "learning_rate": 0.003, + "loss": 4.1138, + "step": 6280 + }, + { + "epoch": 0.06281, + "grad_norm": 0.6921836377836419, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 6281 + }, + { + "epoch": 0.06282, + "grad_norm": 0.7247534756382068, + "learning_rate": 0.003, + "loss": 4.1165, + "step": 6282 + }, + { + "epoch": 0.06283, + "grad_norm": 0.698538662441215, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 6283 + }, + { + "epoch": 0.06284, + "grad_norm": 0.6478092027016852, + "learning_rate": 0.003, + "loss": 4.1211, + "step": 6284 + }, + { + "epoch": 0.06285, + "grad_norm": 0.588832262527032, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 6285 + }, + { + "epoch": 0.06286, + "grad_norm": 0.471220817590307, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 6286 + }, + { + "epoch": 0.06287, + "grad_norm": 0.5253206409382251, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 6287 + }, + { + "epoch": 0.06288, + "grad_norm": 0.5700598976368867, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 6288 + }, + { + "epoch": 0.06289, + "grad_norm": 0.5480401509047442, + "learning_rate": 0.003, + "loss": 4.078, + "step": 6289 + }, + { + "epoch": 0.0629, + "grad_norm": 0.45176823851380066, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 6290 + }, + { + "epoch": 0.06291, + "grad_norm": 0.42244036366221777, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 6291 + }, + { + "epoch": 0.06292, + "grad_norm": 0.4757895925155192, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 6292 + }, + { + "epoch": 0.06293, + "grad_norm": 0.46687329270252953, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 6293 + }, + { + "epoch": 0.06294, + "grad_norm": 0.4692166334566838, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 6294 + }, + { + "epoch": 0.06295, + "grad_norm": 0.4964943486782987, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 6295 + }, + { + "epoch": 0.06296, + "grad_norm": 0.6023566234041191, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 6296 + }, + { + "epoch": 0.06297, + "grad_norm": 0.6562138349481754, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 6297 + }, + { + "epoch": 0.06298, + "grad_norm": 0.7252858308308101, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 6298 + }, + { + "epoch": 0.06299, + "grad_norm": 0.7441518369961619, + "learning_rate": 0.003, + "loss": 4.109, + "step": 6299 + }, + { + "epoch": 0.063, + "grad_norm": 0.643909205073487, + "learning_rate": 0.003, + "loss": 4.1558, + "step": 6300 + }, + { + "epoch": 0.06301, + "grad_norm": 0.7076717275211952, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 6301 + }, + { + "epoch": 0.06302, + "grad_norm": 0.8610584882505145, + "learning_rate": 0.003, + "loss": 4.1298, + "step": 6302 + }, + { + "epoch": 0.06303, + "grad_norm": 0.9134770715261514, + "learning_rate": 0.003, + "loss": 4.1331, + "step": 6303 + }, + { + "epoch": 0.06304, + "grad_norm": 0.8308795580072063, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 6304 + }, + { + "epoch": 0.06305, + "grad_norm": 0.7703883250041841, + "learning_rate": 0.003, + "loss": 4.1556, + "step": 6305 + }, + { + "epoch": 0.06306, + "grad_norm": 0.8074367440959146, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 6306 + }, + { + "epoch": 0.06307, + "grad_norm": 0.7821961103686115, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 6307 + }, + { + "epoch": 0.06308, + "grad_norm": 0.793746924122557, + "learning_rate": 0.003, + "loss": 4.1339, + "step": 6308 + }, + { + "epoch": 0.06309, + "grad_norm": 0.8187320878830382, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 6309 + }, + { + "epoch": 0.0631, + "grad_norm": 0.8140608370958674, + "learning_rate": 0.003, + "loss": 4.1292, + "step": 6310 + }, + { + "epoch": 0.06311, + "grad_norm": 0.7925215372022332, + "learning_rate": 0.003, + "loss": 4.1223, + "step": 6311 + }, + { + "epoch": 0.06312, + "grad_norm": 0.6948175841091384, + "learning_rate": 0.003, + "loss": 4.1132, + "step": 6312 + }, + { + "epoch": 0.06313, + "grad_norm": 0.6098450127577909, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 6313 + }, + { + "epoch": 0.06314, + "grad_norm": 0.601788808023616, + "learning_rate": 0.003, + "loss": 4.1378, + "step": 6314 + }, + { + "epoch": 0.06315, + "grad_norm": 0.7002256479736269, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 6315 + }, + { + "epoch": 0.06316, + "grad_norm": 0.6702801162745691, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 6316 + }, + { + "epoch": 0.06317, + "grad_norm": 0.6767392466613616, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 6317 + }, + { + "epoch": 0.06318, + "grad_norm": 0.7396695999029177, + "learning_rate": 0.003, + "loss": 4.1577, + "step": 6318 + }, + { + "epoch": 0.06319, + "grad_norm": 0.7873666098443218, + "learning_rate": 0.003, + "loss": 4.1632, + "step": 6319 + }, + { + "epoch": 0.0632, + "grad_norm": 0.7968696734464397, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 6320 + }, + { + "epoch": 0.06321, + "grad_norm": 0.748485567878394, + "learning_rate": 0.003, + "loss": 4.1373, + "step": 6321 + }, + { + "epoch": 0.06322, + "grad_norm": 0.700562547460408, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 6322 + }, + { + "epoch": 0.06323, + "grad_norm": 0.5674479127547497, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 6323 + }, + { + "epoch": 0.06324, + "grad_norm": 0.5979201027650989, + "learning_rate": 0.003, + "loss": 4.129, + "step": 6324 + }, + { + "epoch": 0.06325, + "grad_norm": 0.7374786634922282, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 6325 + }, + { + "epoch": 0.06326, + "grad_norm": 0.700878890093403, + "learning_rate": 0.003, + "loss": 4.1281, + "step": 6326 + }, + { + "epoch": 0.06327, + "grad_norm": 0.6414815261782608, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 6327 + }, + { + "epoch": 0.06328, + "grad_norm": 0.5746442825430522, + "learning_rate": 0.003, + "loss": 4.1421, + "step": 6328 + }, + { + "epoch": 0.06329, + "grad_norm": 0.4936256424209013, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 6329 + }, + { + "epoch": 0.0633, + "grad_norm": 0.5247583876058833, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 6330 + }, + { + "epoch": 0.06331, + "grad_norm": 0.5330239575249108, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 6331 + }, + { + "epoch": 0.06332, + "grad_norm": 0.5813651225344605, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 6332 + }, + { + "epoch": 0.06333, + "grad_norm": 0.597763757106397, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 6333 + }, + { + "epoch": 0.06334, + "grad_norm": 0.6306087415380968, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 6334 + }, + { + "epoch": 0.06335, + "grad_norm": 0.7275752434891477, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 6335 + }, + { + "epoch": 0.06336, + "grad_norm": 0.7536067920651794, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 6336 + }, + { + "epoch": 0.06337, + "grad_norm": 0.7434090606482405, + "learning_rate": 0.003, + "loss": 4.1434, + "step": 6337 + }, + { + "epoch": 0.06338, + "grad_norm": 0.7917061963285527, + "learning_rate": 0.003, + "loss": 4.1306, + "step": 6338 + }, + { + "epoch": 0.06339, + "grad_norm": 0.7963119547355659, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 6339 + }, + { + "epoch": 0.0634, + "grad_norm": 0.7009711549087663, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 6340 + }, + { + "epoch": 0.06341, + "grad_norm": 0.560288845949728, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 6341 + }, + { + "epoch": 0.06342, + "grad_norm": 0.6129201020379068, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 6342 + }, + { + "epoch": 0.06343, + "grad_norm": 0.6289890308043102, + "learning_rate": 0.003, + "loss": 4.1299, + "step": 6343 + }, + { + "epoch": 0.06344, + "grad_norm": 0.5883345878311979, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 6344 + }, + { + "epoch": 0.06345, + "grad_norm": 0.6114359553520017, + "learning_rate": 0.003, + "loss": 4.105, + "step": 6345 + }, + { + "epoch": 0.06346, + "grad_norm": 0.6130452330266231, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 6346 + }, + { + "epoch": 0.06347, + "grad_norm": 0.5609868656725044, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 6347 + }, + { + "epoch": 0.06348, + "grad_norm": 0.5111538310088432, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 6348 + }, + { + "epoch": 0.06349, + "grad_norm": 0.47960134551277134, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 6349 + }, + { + "epoch": 0.0635, + "grad_norm": 0.47528221979889074, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 6350 + }, + { + "epoch": 0.06351, + "grad_norm": 0.44208488045258604, + "learning_rate": 0.003, + "loss": 4.1153, + "step": 6351 + }, + { + "epoch": 0.06352, + "grad_norm": 0.5710585928560157, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 6352 + }, + { + "epoch": 0.06353, + "grad_norm": 0.8023230715536823, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 6353 + }, + { + "epoch": 0.06354, + "grad_norm": 1.0817933740124142, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 6354 + }, + { + "epoch": 0.06355, + "grad_norm": 0.839081793625545, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 6355 + }, + { + "epoch": 0.06356, + "grad_norm": 0.6498473408279323, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 6356 + }, + { + "epoch": 0.06357, + "grad_norm": 0.8657856383717981, + "learning_rate": 0.003, + "loss": 4.1443, + "step": 6357 + }, + { + "epoch": 0.06358, + "grad_norm": 0.7367851380124152, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 6358 + }, + { + "epoch": 0.06359, + "grad_norm": 0.5580748213290369, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 6359 + }, + { + "epoch": 0.0636, + "grad_norm": 0.5715351960251037, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 6360 + }, + { + "epoch": 0.06361, + "grad_norm": 0.565402082000186, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 6361 + }, + { + "epoch": 0.06362, + "grad_norm": 0.6032607082774909, + "learning_rate": 0.003, + "loss": 4.1331, + "step": 6362 + }, + { + "epoch": 0.06363, + "grad_norm": 0.5994033595373762, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 6363 + }, + { + "epoch": 0.06364, + "grad_norm": 0.5608720225119531, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 6364 + }, + { + "epoch": 0.06365, + "grad_norm": 0.5416673020756672, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 6365 + }, + { + "epoch": 0.06366, + "grad_norm": 0.5508838364908115, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 6366 + }, + { + "epoch": 0.06367, + "grad_norm": 0.6095798041890194, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 6367 + }, + { + "epoch": 0.06368, + "grad_norm": 0.582397991543009, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 6368 + }, + { + "epoch": 0.06369, + "grad_norm": 0.6111604358018735, + "learning_rate": 0.003, + "loss": 4.145, + "step": 6369 + }, + { + "epoch": 0.0637, + "grad_norm": 0.6887636290273739, + "learning_rate": 0.003, + "loss": 4.108, + "step": 6370 + }, + { + "epoch": 0.06371, + "grad_norm": 0.7312125926181495, + "learning_rate": 0.003, + "loss": 4.1314, + "step": 6371 + }, + { + "epoch": 0.06372, + "grad_norm": 0.7404666495620588, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 6372 + }, + { + "epoch": 0.06373, + "grad_norm": 0.7812823153466856, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 6373 + }, + { + "epoch": 0.06374, + "grad_norm": 0.7678179937935664, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 6374 + }, + { + "epoch": 0.06375, + "grad_norm": 0.7856368742093315, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 6375 + }, + { + "epoch": 0.06376, + "grad_norm": 0.7933141620096883, + "learning_rate": 0.003, + "loss": 4.1244, + "step": 6376 + }, + { + "epoch": 0.06377, + "grad_norm": 0.7911553024083666, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 6377 + }, + { + "epoch": 0.06378, + "grad_norm": 0.7097351170207112, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 6378 + }, + { + "epoch": 0.06379, + "grad_norm": 0.6022623562253553, + "learning_rate": 0.003, + "loss": 4.1282, + "step": 6379 + }, + { + "epoch": 0.0638, + "grad_norm": 0.6161727993530954, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 6380 + }, + { + "epoch": 0.06381, + "grad_norm": 0.5896296208532763, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 6381 + }, + { + "epoch": 0.06382, + "grad_norm": 0.5666427713210422, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 6382 + }, + { + "epoch": 0.06383, + "grad_norm": 0.5280695299014706, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 6383 + }, + { + "epoch": 0.06384, + "grad_norm": 0.5315287442798651, + "learning_rate": 0.003, + "loss": 4.1222, + "step": 6384 + }, + { + "epoch": 0.06385, + "grad_norm": 0.6180327194782872, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 6385 + }, + { + "epoch": 0.06386, + "grad_norm": 0.6877124042376932, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 6386 + }, + { + "epoch": 0.06387, + "grad_norm": 0.7049546795784516, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 6387 + }, + { + "epoch": 0.06388, + "grad_norm": 0.7560102533270469, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 6388 + }, + { + "epoch": 0.06389, + "grad_norm": 0.7727138622823404, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 6389 + }, + { + "epoch": 0.0639, + "grad_norm": 0.6854773910756959, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 6390 + }, + { + "epoch": 0.06391, + "grad_norm": 0.6116296991895176, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 6391 + }, + { + "epoch": 0.06392, + "grad_norm": 0.7278138368053769, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 6392 + }, + { + "epoch": 0.06393, + "grad_norm": 0.8102481077528341, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 6393 + }, + { + "epoch": 0.06394, + "grad_norm": 0.9128273035282637, + "learning_rate": 0.003, + "loss": 4.1237, + "step": 6394 + }, + { + "epoch": 0.06395, + "grad_norm": 0.8577312836930895, + "learning_rate": 0.003, + "loss": 4.1468, + "step": 6395 + }, + { + "epoch": 0.06396, + "grad_norm": 0.7745179973456292, + "learning_rate": 0.003, + "loss": 4.1578, + "step": 6396 + }, + { + "epoch": 0.06397, + "grad_norm": 0.7881218719479312, + "learning_rate": 0.003, + "loss": 4.1151, + "step": 6397 + }, + { + "epoch": 0.06398, + "grad_norm": 0.7024891362838419, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 6398 + }, + { + "epoch": 0.06399, + "grad_norm": 0.7110448250319877, + "learning_rate": 0.003, + "loss": 4.1321, + "step": 6399 + }, + { + "epoch": 0.064, + "grad_norm": 0.7111684482738276, + "learning_rate": 0.003, + "loss": 4.1229, + "step": 6400 + }, + { + "epoch": 0.06401, + "grad_norm": 0.693597181970454, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 6401 + }, + { + "epoch": 0.06402, + "grad_norm": 0.7922387757761263, + "learning_rate": 0.003, + "loss": 4.115, + "step": 6402 + }, + { + "epoch": 0.06403, + "grad_norm": 0.8260320747984292, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 6403 + }, + { + "epoch": 0.06404, + "grad_norm": 0.729663274761036, + "learning_rate": 0.003, + "loss": 4.1675, + "step": 6404 + }, + { + "epoch": 0.06405, + "grad_norm": 0.7378732939090409, + "learning_rate": 0.003, + "loss": 4.1292, + "step": 6405 + }, + { + "epoch": 0.06406, + "grad_norm": 0.6383629571059021, + "learning_rate": 0.003, + "loss": 4.112, + "step": 6406 + }, + { + "epoch": 0.06407, + "grad_norm": 0.6050162196732369, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 6407 + }, + { + "epoch": 0.06408, + "grad_norm": 0.5983568485448366, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 6408 + }, + { + "epoch": 0.06409, + "grad_norm": 0.5642818531961227, + "learning_rate": 0.003, + "loss": 4.1332, + "step": 6409 + }, + { + "epoch": 0.0641, + "grad_norm": 0.5874430801692443, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 6410 + }, + { + "epoch": 0.06411, + "grad_norm": 0.6330649262504477, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 6411 + }, + { + "epoch": 0.06412, + "grad_norm": 0.6618713014028005, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 6412 + }, + { + "epoch": 0.06413, + "grad_norm": 0.700072236260971, + "learning_rate": 0.003, + "loss": 4.1443, + "step": 6413 + }, + { + "epoch": 0.06414, + "grad_norm": 0.6353084694205889, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 6414 + }, + { + "epoch": 0.06415, + "grad_norm": 0.5831173982008553, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 6415 + }, + { + "epoch": 0.06416, + "grad_norm": 0.5437779001226543, + "learning_rate": 0.003, + "loss": 4.1331, + "step": 6416 + }, + { + "epoch": 0.06417, + "grad_norm": 0.6107145316832859, + "learning_rate": 0.003, + "loss": 4.1055, + "step": 6417 + }, + { + "epoch": 0.06418, + "grad_norm": 0.6005928402730104, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 6418 + }, + { + "epoch": 0.06419, + "grad_norm": 0.583926884989267, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 6419 + }, + { + "epoch": 0.0642, + "grad_norm": 0.6742080868290792, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 6420 + }, + { + "epoch": 0.06421, + "grad_norm": 0.7855355256548284, + "learning_rate": 0.003, + "loss": 4.1294, + "step": 6421 + }, + { + "epoch": 0.06422, + "grad_norm": 0.9319136699441484, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 6422 + }, + { + "epoch": 0.06423, + "grad_norm": 0.999583894911011, + "learning_rate": 0.003, + "loss": 4.1451, + "step": 6423 + }, + { + "epoch": 0.06424, + "grad_norm": 0.9331342374240993, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 6424 + }, + { + "epoch": 0.06425, + "grad_norm": 0.7793159824304509, + "learning_rate": 0.003, + "loss": 4.1284, + "step": 6425 + }, + { + "epoch": 0.06426, + "grad_norm": 0.6962948866885783, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 6426 + }, + { + "epoch": 0.06427, + "grad_norm": 0.683410446043668, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 6427 + }, + { + "epoch": 0.06428, + "grad_norm": 0.6512307290531658, + "learning_rate": 0.003, + "loss": 4.1283, + "step": 6428 + }, + { + "epoch": 0.06429, + "grad_norm": 0.6377825140551957, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 6429 + }, + { + "epoch": 0.0643, + "grad_norm": 0.5759747531004582, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 6430 + }, + { + "epoch": 0.06431, + "grad_norm": 0.5676646229844707, + "learning_rate": 0.003, + "loss": 4.114, + "step": 6431 + }, + { + "epoch": 0.06432, + "grad_norm": 0.5461669723653775, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 6432 + }, + { + "epoch": 0.06433, + "grad_norm": 0.45797081849907423, + "learning_rate": 0.003, + "loss": 4.13, + "step": 6433 + }, + { + "epoch": 0.06434, + "grad_norm": 0.40792521706337564, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 6434 + }, + { + "epoch": 0.06435, + "grad_norm": 0.4730517544060023, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 6435 + }, + { + "epoch": 0.06436, + "grad_norm": 0.4766134688447076, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 6436 + }, + { + "epoch": 0.06437, + "grad_norm": 0.45378874087374643, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 6437 + }, + { + "epoch": 0.06438, + "grad_norm": 0.44043582431299527, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 6438 + }, + { + "epoch": 0.06439, + "grad_norm": 0.46296521989188577, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 6439 + }, + { + "epoch": 0.0644, + "grad_norm": 0.43339108687839656, + "learning_rate": 0.003, + "loss": 4.095, + "step": 6440 + }, + { + "epoch": 0.06441, + "grad_norm": 0.5476027827440176, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 6441 + }, + { + "epoch": 0.06442, + "grad_norm": 0.7595327046230422, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 6442 + }, + { + "epoch": 0.06443, + "grad_norm": 0.8816633245265507, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 6443 + }, + { + "epoch": 0.06444, + "grad_norm": 0.7973460889122836, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 6444 + }, + { + "epoch": 0.06445, + "grad_norm": 0.6695114396139051, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 6445 + }, + { + "epoch": 0.06446, + "grad_norm": 0.7112577504861094, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 6446 + }, + { + "epoch": 0.06447, + "grad_norm": 0.6669997171392789, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 6447 + }, + { + "epoch": 0.06448, + "grad_norm": 0.7000529377796273, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 6448 + }, + { + "epoch": 0.06449, + "grad_norm": 0.7629296906277161, + "learning_rate": 0.003, + "loss": 4.1573, + "step": 6449 + }, + { + "epoch": 0.0645, + "grad_norm": 0.8151936619117285, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 6450 + }, + { + "epoch": 0.06451, + "grad_norm": 0.7790440723976773, + "learning_rate": 0.003, + "loss": 4.1546, + "step": 6451 + }, + { + "epoch": 0.06452, + "grad_norm": 0.7540024646037367, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 6452 + }, + { + "epoch": 0.06453, + "grad_norm": 0.7911961892915145, + "learning_rate": 0.003, + "loss": 4.1565, + "step": 6453 + }, + { + "epoch": 0.06454, + "grad_norm": 0.8237841593306827, + "learning_rate": 0.003, + "loss": 4.1173, + "step": 6454 + }, + { + "epoch": 0.06455, + "grad_norm": 0.8231550575684212, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 6455 + }, + { + "epoch": 0.06456, + "grad_norm": 0.7811541896303539, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 6456 + }, + { + "epoch": 0.06457, + "grad_norm": 0.8750658750124761, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 6457 + }, + { + "epoch": 0.06458, + "grad_norm": 0.9462957109484857, + "learning_rate": 0.003, + "loss": 4.1368, + "step": 6458 + }, + { + "epoch": 0.06459, + "grad_norm": 0.951508448401663, + "learning_rate": 0.003, + "loss": 4.1525, + "step": 6459 + }, + { + "epoch": 0.0646, + "grad_norm": 0.7887348617904948, + "learning_rate": 0.003, + "loss": 4.1259, + "step": 6460 + }, + { + "epoch": 0.06461, + "grad_norm": 0.6561119853474549, + "learning_rate": 0.003, + "loss": 4.1643, + "step": 6461 + }, + { + "epoch": 0.06462, + "grad_norm": 0.6564843031193351, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 6462 + }, + { + "epoch": 0.06463, + "grad_norm": 0.6866574328355095, + "learning_rate": 0.003, + "loss": 4.1138, + "step": 6463 + }, + { + "epoch": 0.06464, + "grad_norm": 0.7412943846552807, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 6464 + }, + { + "epoch": 0.06465, + "grad_norm": 0.8607666448452919, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 6465 + }, + { + "epoch": 0.06466, + "grad_norm": 0.9805980509960028, + "learning_rate": 0.003, + "loss": 4.1351, + "step": 6466 + }, + { + "epoch": 0.06467, + "grad_norm": 0.8240025115330769, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 6467 + }, + { + "epoch": 0.06468, + "grad_norm": 0.6915322018007451, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 6468 + }, + { + "epoch": 0.06469, + "grad_norm": 0.758205250001039, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 6469 + }, + { + "epoch": 0.0647, + "grad_norm": 0.6894610388291892, + "learning_rate": 0.003, + "loss": 4.1469, + "step": 6470 + }, + { + "epoch": 0.06471, + "grad_norm": 0.5734326985005797, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 6471 + }, + { + "epoch": 0.06472, + "grad_norm": 0.6006963939375836, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 6472 + }, + { + "epoch": 0.06473, + "grad_norm": 0.5903102918749725, + "learning_rate": 0.003, + "loss": 4.1342, + "step": 6473 + }, + { + "epoch": 0.06474, + "grad_norm": 0.6245812199248715, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 6474 + }, + { + "epoch": 0.06475, + "grad_norm": 0.6385947320236768, + "learning_rate": 0.003, + "loss": 4.1448, + "step": 6475 + }, + { + "epoch": 0.06476, + "grad_norm": 0.5964051582023622, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 6476 + }, + { + "epoch": 0.06477, + "grad_norm": 0.4545337714425668, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 6477 + }, + { + "epoch": 0.06478, + "grad_norm": 0.4067813603673349, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 6478 + }, + { + "epoch": 0.06479, + "grad_norm": 0.35829021719441345, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 6479 + }, + { + "epoch": 0.0648, + "grad_norm": 0.36043755979794306, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 6480 + }, + { + "epoch": 0.06481, + "grad_norm": 0.379368972959528, + "learning_rate": 0.003, + "loss": 4.102, + "step": 6481 + }, + { + "epoch": 0.06482, + "grad_norm": 0.3991318139475991, + "learning_rate": 0.003, + "loss": 4.1, + "step": 6482 + }, + { + "epoch": 0.06483, + "grad_norm": 0.5102756201687999, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 6483 + }, + { + "epoch": 0.06484, + "grad_norm": 0.6747208649376389, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 6484 + }, + { + "epoch": 0.06485, + "grad_norm": 0.818751842126756, + "learning_rate": 0.003, + "loss": 4.1385, + "step": 6485 + }, + { + "epoch": 0.06486, + "grad_norm": 0.8421941035958429, + "learning_rate": 0.003, + "loss": 4.139, + "step": 6486 + }, + { + "epoch": 0.06487, + "grad_norm": 0.8130967059681043, + "learning_rate": 0.003, + "loss": 4.1268, + "step": 6487 + }, + { + "epoch": 0.06488, + "grad_norm": 0.8190321300487033, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 6488 + }, + { + "epoch": 0.06489, + "grad_norm": 0.8618794228209637, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 6489 + }, + { + "epoch": 0.0649, + "grad_norm": 0.8167067584812114, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 6490 + }, + { + "epoch": 0.06491, + "grad_norm": 0.7120159141814162, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 6491 + }, + { + "epoch": 0.06492, + "grad_norm": 0.6966415188190852, + "learning_rate": 0.003, + "loss": 4.1358, + "step": 6492 + }, + { + "epoch": 0.06493, + "grad_norm": 0.6279768739419431, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 6493 + }, + { + "epoch": 0.06494, + "grad_norm": 0.6729099375336287, + "learning_rate": 0.003, + "loss": 4.1301, + "step": 6494 + }, + { + "epoch": 0.06495, + "grad_norm": 0.7138117731723219, + "learning_rate": 0.003, + "loss": 4.1234, + "step": 6495 + }, + { + "epoch": 0.06496, + "grad_norm": 0.7481765813018704, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 6496 + }, + { + "epoch": 0.06497, + "grad_norm": 0.6676385968633429, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 6497 + }, + { + "epoch": 0.06498, + "grad_norm": 0.5497380735072145, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 6498 + }, + { + "epoch": 0.06499, + "grad_norm": 0.5123283023400911, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 6499 + }, + { + "epoch": 0.065, + "grad_norm": 0.4909211309507681, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 6500 + }, + { + "epoch": 0.06501, + "grad_norm": 0.47186804780732317, + "learning_rate": 0.003, + "loss": 4.1, + "step": 6501 + }, + { + "epoch": 0.06502, + "grad_norm": 0.4586207578169618, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 6502 + }, + { + "epoch": 0.06503, + "grad_norm": 0.4734551401435721, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 6503 + }, + { + "epoch": 0.06504, + "grad_norm": 0.4518516829518515, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 6504 + }, + { + "epoch": 0.06505, + "grad_norm": 0.3926267599626706, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 6505 + }, + { + "epoch": 0.06506, + "grad_norm": 0.4116861037565272, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 6506 + }, + { + "epoch": 0.06507, + "grad_norm": 0.4869551083875778, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 6507 + }, + { + "epoch": 0.06508, + "grad_norm": 0.595601444957384, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 6508 + }, + { + "epoch": 0.06509, + "grad_norm": 0.8176861477648554, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 6509 + }, + { + "epoch": 0.0651, + "grad_norm": 0.9793486633870401, + "learning_rate": 0.003, + "loss": 4.1399, + "step": 6510 + }, + { + "epoch": 0.06511, + "grad_norm": 1.101260179163563, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 6511 + }, + { + "epoch": 0.06512, + "grad_norm": 0.7048089149615372, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 6512 + }, + { + "epoch": 0.06513, + "grad_norm": 0.6606285760107887, + "learning_rate": 0.003, + "loss": 4.118, + "step": 6513 + }, + { + "epoch": 0.06514, + "grad_norm": 0.6936176900884794, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 6514 + }, + { + "epoch": 0.06515, + "grad_norm": 0.5802739054552889, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 6515 + }, + { + "epoch": 0.06516, + "grad_norm": 0.5957435080730257, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 6516 + }, + { + "epoch": 0.06517, + "grad_norm": 0.6026894638436581, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 6517 + }, + { + "epoch": 0.06518, + "grad_norm": 0.6442432050579253, + "learning_rate": 0.003, + "loss": 4.131, + "step": 6518 + }, + { + "epoch": 0.06519, + "grad_norm": 0.6958415784014198, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 6519 + }, + { + "epoch": 0.0652, + "grad_norm": 0.687200073329854, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 6520 + }, + { + "epoch": 0.06521, + "grad_norm": 0.6695110160765739, + "learning_rate": 0.003, + "loss": 4.126, + "step": 6521 + }, + { + "epoch": 0.06522, + "grad_norm": 0.6730631239357413, + "learning_rate": 0.003, + "loss": 4.1268, + "step": 6522 + }, + { + "epoch": 0.06523, + "grad_norm": 0.6680040550931838, + "learning_rate": 0.003, + "loss": 4.116, + "step": 6523 + }, + { + "epoch": 0.06524, + "grad_norm": 0.7000701390808372, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 6524 + }, + { + "epoch": 0.06525, + "grad_norm": 0.6591874414899291, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 6525 + }, + { + "epoch": 0.06526, + "grad_norm": 0.5974142684472702, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 6526 + }, + { + "epoch": 0.06527, + "grad_norm": 0.629624470137727, + "learning_rate": 0.003, + "loss": 4.123, + "step": 6527 + }, + { + "epoch": 0.06528, + "grad_norm": 0.7082086868906479, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 6528 + }, + { + "epoch": 0.06529, + "grad_norm": 0.7547993486095116, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 6529 + }, + { + "epoch": 0.0653, + "grad_norm": 0.7160694526225951, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 6530 + }, + { + "epoch": 0.06531, + "grad_norm": 0.7091031625495451, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 6531 + }, + { + "epoch": 0.06532, + "grad_norm": 0.5960779659612676, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 6532 + }, + { + "epoch": 0.06533, + "grad_norm": 0.569584737632218, + "learning_rate": 0.003, + "loss": 4.1241, + "step": 6533 + }, + { + "epoch": 0.06534, + "grad_norm": 0.47053846099049507, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 6534 + }, + { + "epoch": 0.06535, + "grad_norm": 0.4876438724766613, + "learning_rate": 0.003, + "loss": 4.074, + "step": 6535 + }, + { + "epoch": 0.06536, + "grad_norm": 0.5421075581742479, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 6536 + }, + { + "epoch": 0.06537, + "grad_norm": 0.6138808767175822, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 6537 + }, + { + "epoch": 0.06538, + "grad_norm": 0.681850513199098, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 6538 + }, + { + "epoch": 0.06539, + "grad_norm": 0.7093467740008924, + "learning_rate": 0.003, + "loss": 4.089, + "step": 6539 + }, + { + "epoch": 0.0654, + "grad_norm": 0.6796947934262606, + "learning_rate": 0.003, + "loss": 4.1443, + "step": 6540 + }, + { + "epoch": 0.06541, + "grad_norm": 0.5895536662879667, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 6541 + }, + { + "epoch": 0.06542, + "grad_norm": 0.5887252594422324, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 6542 + }, + { + "epoch": 0.06543, + "grad_norm": 0.6070666236587158, + "learning_rate": 0.003, + "loss": 4.119, + "step": 6543 + }, + { + "epoch": 0.06544, + "grad_norm": 0.6336458361165094, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 6544 + }, + { + "epoch": 0.06545, + "grad_norm": 0.5994933552644387, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 6545 + }, + { + "epoch": 0.06546, + "grad_norm": 0.5861867238542918, + "learning_rate": 0.003, + "loss": 4.099, + "step": 6546 + }, + { + "epoch": 0.06547, + "grad_norm": 0.7675266494691649, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 6547 + }, + { + "epoch": 0.06548, + "grad_norm": 0.7584810341137693, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 6548 + }, + { + "epoch": 0.06549, + "grad_norm": 0.7244422065216164, + "learning_rate": 0.003, + "loss": 4.1372, + "step": 6549 + }, + { + "epoch": 0.0655, + "grad_norm": 0.7864205468003803, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 6550 + }, + { + "epoch": 0.06551, + "grad_norm": 0.7433546549614126, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 6551 + }, + { + "epoch": 0.06552, + "grad_norm": 0.8758299277222058, + "learning_rate": 0.003, + "loss": 4.142, + "step": 6552 + }, + { + "epoch": 0.06553, + "grad_norm": 0.981060258172319, + "learning_rate": 0.003, + "loss": 4.1274, + "step": 6553 + }, + { + "epoch": 0.06554, + "grad_norm": 1.0836670305325147, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 6554 + }, + { + "epoch": 0.06555, + "grad_norm": 0.893552519172031, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 6555 + }, + { + "epoch": 0.06556, + "grad_norm": 0.8133887797335478, + "learning_rate": 0.003, + "loss": 4.1481, + "step": 6556 + }, + { + "epoch": 0.06557, + "grad_norm": 0.8038068218784429, + "learning_rate": 0.003, + "loss": 4.125, + "step": 6557 + }, + { + "epoch": 0.06558, + "grad_norm": 0.8563243567849327, + "learning_rate": 0.003, + "loss": 4.1577, + "step": 6558 + }, + { + "epoch": 0.06559, + "grad_norm": 0.8357600944003225, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 6559 + }, + { + "epoch": 0.0656, + "grad_norm": 0.7471745091465914, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 6560 + }, + { + "epoch": 0.06561, + "grad_norm": 0.7197252510341869, + "learning_rate": 0.003, + "loss": 4.1523, + "step": 6561 + }, + { + "epoch": 0.06562, + "grad_norm": 0.6961222717520075, + "learning_rate": 0.003, + "loss": 4.1055, + "step": 6562 + }, + { + "epoch": 0.06563, + "grad_norm": 0.7969065869116986, + "learning_rate": 0.003, + "loss": 4.129, + "step": 6563 + }, + { + "epoch": 0.06564, + "grad_norm": 0.8609617020319454, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 6564 + }, + { + "epoch": 0.06565, + "grad_norm": 0.7706054033095626, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 6565 + }, + { + "epoch": 0.06566, + "grad_norm": 0.736160028183247, + "learning_rate": 0.003, + "loss": 4.1315, + "step": 6566 + }, + { + "epoch": 0.06567, + "grad_norm": 0.8827507523317322, + "learning_rate": 0.003, + "loss": 4.1502, + "step": 6567 + }, + { + "epoch": 0.06568, + "grad_norm": 0.9730014527594969, + "learning_rate": 0.003, + "loss": 4.1614, + "step": 6568 + }, + { + "epoch": 0.06569, + "grad_norm": 0.8987588032009837, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 6569 + }, + { + "epoch": 0.0657, + "grad_norm": 0.7480401403974306, + "learning_rate": 0.003, + "loss": 4.131, + "step": 6570 + }, + { + "epoch": 0.06571, + "grad_norm": 0.5934268593799419, + "learning_rate": 0.003, + "loss": 4.1311, + "step": 6571 + }, + { + "epoch": 0.06572, + "grad_norm": 0.6585081494040603, + "learning_rate": 0.003, + "loss": 4.099, + "step": 6572 + }, + { + "epoch": 0.06573, + "grad_norm": 0.7295365583733733, + "learning_rate": 0.003, + "loss": 4.13, + "step": 6573 + }, + { + "epoch": 0.06574, + "grad_norm": 0.756954183791856, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 6574 + }, + { + "epoch": 0.06575, + "grad_norm": 0.7099875197144789, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 6575 + }, + { + "epoch": 0.06576, + "grad_norm": 0.6986413932325001, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 6576 + }, + { + "epoch": 0.06577, + "grad_norm": 0.6408480221530382, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 6577 + }, + { + "epoch": 0.06578, + "grad_norm": 0.5861819343962567, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 6578 + }, + { + "epoch": 0.06579, + "grad_norm": 0.5772678504283916, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 6579 + }, + { + "epoch": 0.0658, + "grad_norm": 0.4907489111870778, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 6580 + }, + { + "epoch": 0.06581, + "grad_norm": 0.4803454909907143, + "learning_rate": 0.003, + "loss": 4.1292, + "step": 6581 + }, + { + "epoch": 0.06582, + "grad_norm": 0.5180246165607337, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 6582 + }, + { + "epoch": 0.06583, + "grad_norm": 0.63192504153945, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 6583 + }, + { + "epoch": 0.06584, + "grad_norm": 0.7212896215769731, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 6584 + }, + { + "epoch": 0.06585, + "grad_norm": 0.7725745929887623, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 6585 + }, + { + "epoch": 0.06586, + "grad_norm": 0.7150680524436269, + "learning_rate": 0.003, + "loss": 4.1259, + "step": 6586 + }, + { + "epoch": 0.06587, + "grad_norm": 0.6454511286444826, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 6587 + }, + { + "epoch": 0.06588, + "grad_norm": 0.6209496060701972, + "learning_rate": 0.003, + "loss": 4.1439, + "step": 6588 + }, + { + "epoch": 0.06589, + "grad_norm": 0.643641834535616, + "learning_rate": 0.003, + "loss": 4.134, + "step": 6589 + }, + { + "epoch": 0.0659, + "grad_norm": 0.6545001554853087, + "learning_rate": 0.003, + "loss": 4.1411, + "step": 6590 + }, + { + "epoch": 0.06591, + "grad_norm": 0.6756223183950393, + "learning_rate": 0.003, + "loss": 4.162, + "step": 6591 + }, + { + "epoch": 0.06592, + "grad_norm": 0.6681680196479507, + "learning_rate": 0.003, + "loss": 4.1265, + "step": 6592 + }, + { + "epoch": 0.06593, + "grad_norm": 0.6370540979582453, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 6593 + }, + { + "epoch": 0.06594, + "grad_norm": 0.6592450103168598, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 6594 + }, + { + "epoch": 0.06595, + "grad_norm": 0.6488902280727541, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 6595 + }, + { + "epoch": 0.06596, + "grad_norm": 0.6078864404662637, + "learning_rate": 0.003, + "loss": 4.1304, + "step": 6596 + }, + { + "epoch": 0.06597, + "grad_norm": 0.5826732446122348, + "learning_rate": 0.003, + "loss": 4.105, + "step": 6597 + }, + { + "epoch": 0.06598, + "grad_norm": 0.5786447854885438, + "learning_rate": 0.003, + "loss": 4.1355, + "step": 6598 + }, + { + "epoch": 0.06599, + "grad_norm": 0.5203150176294418, + "learning_rate": 0.003, + "loss": 4.1165, + "step": 6599 + }, + { + "epoch": 0.066, + "grad_norm": 0.5194458831005944, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 6600 + }, + { + "epoch": 0.06601, + "grad_norm": 0.5125280130771507, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 6601 + }, + { + "epoch": 0.06602, + "grad_norm": 0.5255374937732523, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 6602 + }, + { + "epoch": 0.06603, + "grad_norm": 0.5756998423978795, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 6603 + }, + { + "epoch": 0.06604, + "grad_norm": 0.6322066603487145, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 6604 + }, + { + "epoch": 0.06605, + "grad_norm": 0.6466004843165204, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 6605 + }, + { + "epoch": 0.06606, + "grad_norm": 0.7167353080028168, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 6606 + }, + { + "epoch": 0.06607, + "grad_norm": 0.8469413810873132, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 6607 + }, + { + "epoch": 0.06608, + "grad_norm": 0.7995923933103859, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 6608 + }, + { + "epoch": 0.06609, + "grad_norm": 0.681750060363617, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 6609 + }, + { + "epoch": 0.0661, + "grad_norm": 0.6480858792190892, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 6610 + }, + { + "epoch": 0.06611, + "grad_norm": 0.658286589419674, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 6611 + }, + { + "epoch": 0.06612, + "grad_norm": 0.6793208367131326, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 6612 + }, + { + "epoch": 0.06613, + "grad_norm": 0.6454677330588496, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 6613 + }, + { + "epoch": 0.06614, + "grad_norm": 0.6649584285769843, + "learning_rate": 0.003, + "loss": 4.1145, + "step": 6614 + }, + { + "epoch": 0.06615, + "grad_norm": 0.62805829678068, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 6615 + }, + { + "epoch": 0.06616, + "grad_norm": 0.5438812377408372, + "learning_rate": 0.003, + "loss": 4.1439, + "step": 6616 + }, + { + "epoch": 0.06617, + "grad_norm": 0.5527092065688209, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 6617 + }, + { + "epoch": 0.06618, + "grad_norm": 0.6122283269442648, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 6618 + }, + { + "epoch": 0.06619, + "grad_norm": 0.742890868164231, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 6619 + }, + { + "epoch": 0.0662, + "grad_norm": 0.7283542861441105, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 6620 + }, + { + "epoch": 0.06621, + "grad_norm": 0.5718678667246628, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 6621 + }, + { + "epoch": 0.06622, + "grad_norm": 0.5903274463487566, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 6622 + }, + { + "epoch": 0.06623, + "grad_norm": 0.6512429289105603, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 6623 + }, + { + "epoch": 0.06624, + "grad_norm": 0.6598637861946588, + "learning_rate": 0.003, + "loss": 4.1301, + "step": 6624 + }, + { + "epoch": 0.06625, + "grad_norm": 0.6856917785450871, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 6625 + }, + { + "epoch": 0.06626, + "grad_norm": 0.6780772216393701, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 6626 + }, + { + "epoch": 0.06627, + "grad_norm": 0.6760700441536392, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 6627 + }, + { + "epoch": 0.06628, + "grad_norm": 0.6093236296424669, + "learning_rate": 0.003, + "loss": 4.122, + "step": 6628 + }, + { + "epoch": 0.06629, + "grad_norm": 0.54950173826257, + "learning_rate": 0.003, + "loss": 4.1162, + "step": 6629 + }, + { + "epoch": 0.0663, + "grad_norm": 0.5194281298880598, + "learning_rate": 0.003, + "loss": 4.1318, + "step": 6630 + }, + { + "epoch": 0.06631, + "grad_norm": 0.4311499243556024, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 6631 + }, + { + "epoch": 0.06632, + "grad_norm": 0.4475581432730069, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 6632 + }, + { + "epoch": 0.06633, + "grad_norm": 0.5189111671509972, + "learning_rate": 0.003, + "loss": 4.1254, + "step": 6633 + }, + { + "epoch": 0.06634, + "grad_norm": 0.6506213384335201, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 6634 + }, + { + "epoch": 0.06635, + "grad_norm": 0.951821788755601, + "learning_rate": 0.003, + "loss": 4.1452, + "step": 6635 + }, + { + "epoch": 0.06636, + "grad_norm": 1.1753961378034505, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 6636 + }, + { + "epoch": 0.06637, + "grad_norm": 0.8274131958669491, + "learning_rate": 0.003, + "loss": 4.1253, + "step": 6637 + }, + { + "epoch": 0.06638, + "grad_norm": 0.9087736611088357, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 6638 + }, + { + "epoch": 0.06639, + "grad_norm": 0.8594515515830997, + "learning_rate": 0.003, + "loss": 4.1355, + "step": 6639 + }, + { + "epoch": 0.0664, + "grad_norm": 0.7981389214332112, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 6640 + }, + { + "epoch": 0.06641, + "grad_norm": 0.7337294828850577, + "learning_rate": 0.003, + "loss": 4.1325, + "step": 6641 + }, + { + "epoch": 0.06642, + "grad_norm": 0.6533577770906497, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 6642 + }, + { + "epoch": 0.06643, + "grad_norm": 0.7837625761612944, + "learning_rate": 0.003, + "loss": 4.1251, + "step": 6643 + }, + { + "epoch": 0.06644, + "grad_norm": 0.7304936419918611, + "learning_rate": 0.003, + "loss": 4.1262, + "step": 6644 + }, + { + "epoch": 0.06645, + "grad_norm": 0.7132250314025796, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 6645 + }, + { + "epoch": 0.06646, + "grad_norm": 0.7488926225230269, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 6646 + }, + { + "epoch": 0.06647, + "grad_norm": 0.793819493883866, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 6647 + }, + { + "epoch": 0.06648, + "grad_norm": 0.8562701883893814, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 6648 + }, + { + "epoch": 0.06649, + "grad_norm": 0.8550813838279994, + "learning_rate": 0.003, + "loss": 4.1347, + "step": 6649 + }, + { + "epoch": 0.0665, + "grad_norm": 0.921797160027487, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 6650 + }, + { + "epoch": 0.06651, + "grad_norm": 0.8818469027010972, + "learning_rate": 0.003, + "loss": 4.099, + "step": 6651 + }, + { + "epoch": 0.06652, + "grad_norm": 0.8146093740190103, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 6652 + }, + { + "epoch": 0.06653, + "grad_norm": 0.7944236365667351, + "learning_rate": 0.003, + "loss": 4.1526, + "step": 6653 + }, + { + "epoch": 0.06654, + "grad_norm": 0.8268460125317942, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 6654 + }, + { + "epoch": 0.06655, + "grad_norm": 0.8093935411338872, + "learning_rate": 0.003, + "loss": 4.1337, + "step": 6655 + }, + { + "epoch": 0.06656, + "grad_norm": 0.7794404155644764, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 6656 + }, + { + "epoch": 0.06657, + "grad_norm": 0.867714800249613, + "learning_rate": 0.003, + "loss": 4.1413, + "step": 6657 + }, + { + "epoch": 0.06658, + "grad_norm": 0.9142624375781937, + "learning_rate": 0.003, + "loss": 4.1351, + "step": 6658 + }, + { + "epoch": 0.06659, + "grad_norm": 0.9521474355283531, + "learning_rate": 0.003, + "loss": 4.1507, + "step": 6659 + }, + { + "epoch": 0.0666, + "grad_norm": 0.9916432590288042, + "learning_rate": 0.003, + "loss": 4.1409, + "step": 6660 + }, + { + "epoch": 0.06661, + "grad_norm": 0.7955314202862873, + "learning_rate": 0.003, + "loss": 4.1292, + "step": 6661 + }, + { + "epoch": 0.06662, + "grad_norm": 0.610721250942562, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 6662 + }, + { + "epoch": 0.06663, + "grad_norm": 0.6122676305957936, + "learning_rate": 0.003, + "loss": 4.182, + "step": 6663 + }, + { + "epoch": 0.06664, + "grad_norm": 0.5620102202705014, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 6664 + }, + { + "epoch": 0.06665, + "grad_norm": 0.5333334036557247, + "learning_rate": 0.003, + "loss": 4.1608, + "step": 6665 + }, + { + "epoch": 0.06666, + "grad_norm": 0.4779105794437428, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 6666 + }, + { + "epoch": 0.06667, + "grad_norm": 0.4623720974757374, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 6667 + }, + { + "epoch": 0.06668, + "grad_norm": 0.43970305974094764, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 6668 + }, + { + "epoch": 0.06669, + "grad_norm": 0.4262049724870593, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 6669 + }, + { + "epoch": 0.0667, + "grad_norm": 0.43473058147660226, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 6670 + }, + { + "epoch": 0.06671, + "grad_norm": 0.4811373905144156, + "learning_rate": 0.003, + "loss": 4.1153, + "step": 6671 + }, + { + "epoch": 0.06672, + "grad_norm": 0.48294732369579685, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 6672 + }, + { + "epoch": 0.06673, + "grad_norm": 0.43487438428595204, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 6673 + }, + { + "epoch": 0.06674, + "grad_norm": 0.42810600796813436, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 6674 + }, + { + "epoch": 0.06675, + "grad_norm": 0.4187127469658615, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 6675 + }, + { + "epoch": 0.06676, + "grad_norm": 0.40648645104736636, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 6676 + }, + { + "epoch": 0.06677, + "grad_norm": 0.45239784588879955, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 6677 + }, + { + "epoch": 0.06678, + "grad_norm": 0.507095938002501, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 6678 + }, + { + "epoch": 0.06679, + "grad_norm": 0.5982462726379839, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 6679 + }, + { + "epoch": 0.0668, + "grad_norm": 0.7085661257950622, + "learning_rate": 0.003, + "loss": 4.1394, + "step": 6680 + }, + { + "epoch": 0.06681, + "grad_norm": 0.7670644647417465, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 6681 + }, + { + "epoch": 0.06682, + "grad_norm": 0.666267524204535, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 6682 + }, + { + "epoch": 0.06683, + "grad_norm": 0.5601369401979518, + "learning_rate": 0.003, + "loss": 4.1063, + "step": 6683 + }, + { + "epoch": 0.06684, + "grad_norm": 0.6101758101844771, + "learning_rate": 0.003, + "loss": 4.1121, + "step": 6684 + }, + { + "epoch": 0.06685, + "grad_norm": 0.7821140255267783, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 6685 + }, + { + "epoch": 0.06686, + "grad_norm": 0.870444885006984, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 6686 + }, + { + "epoch": 0.06687, + "grad_norm": 0.885507452709397, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 6687 + }, + { + "epoch": 0.06688, + "grad_norm": 0.7868701754781342, + "learning_rate": 0.003, + "loss": 4.104, + "step": 6688 + }, + { + "epoch": 0.06689, + "grad_norm": 0.8462898511590071, + "learning_rate": 0.003, + "loss": 4.1453, + "step": 6689 + }, + { + "epoch": 0.0669, + "grad_norm": 0.7964646938445537, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 6690 + }, + { + "epoch": 0.06691, + "grad_norm": 0.7632100671298492, + "learning_rate": 0.003, + "loss": 4.1254, + "step": 6691 + }, + { + "epoch": 0.06692, + "grad_norm": 0.7875561157456419, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 6692 + }, + { + "epoch": 0.06693, + "grad_norm": 0.8456951569948422, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 6693 + }, + { + "epoch": 0.06694, + "grad_norm": 0.8930574483409586, + "learning_rate": 0.003, + "loss": 4.1239, + "step": 6694 + }, + { + "epoch": 0.06695, + "grad_norm": 0.7514288949015006, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 6695 + }, + { + "epoch": 0.06696, + "grad_norm": 0.7702907982422175, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 6696 + }, + { + "epoch": 0.06697, + "grad_norm": 0.736140406934018, + "learning_rate": 0.003, + "loss": 4.1304, + "step": 6697 + }, + { + "epoch": 0.06698, + "grad_norm": 0.7509814571017203, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 6698 + }, + { + "epoch": 0.06699, + "grad_norm": 0.6631263407471134, + "learning_rate": 0.003, + "loss": 4.115, + "step": 6699 + }, + { + "epoch": 0.067, + "grad_norm": 0.5871001542490745, + "learning_rate": 0.003, + "loss": 4.1297, + "step": 6700 + }, + { + "epoch": 0.06701, + "grad_norm": 0.5435734122195582, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 6701 + }, + { + "epoch": 0.06702, + "grad_norm": 0.5252277352718284, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 6702 + }, + { + "epoch": 0.06703, + "grad_norm": 0.6019201583785915, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 6703 + }, + { + "epoch": 0.06704, + "grad_norm": 0.6629860196845838, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 6704 + }, + { + "epoch": 0.06705, + "grad_norm": 0.7076318403084291, + "learning_rate": 0.003, + "loss": 4.108, + "step": 6705 + }, + { + "epoch": 0.06706, + "grad_norm": 0.7718630424234669, + "learning_rate": 0.003, + "loss": 4.1562, + "step": 6706 + }, + { + "epoch": 0.06707, + "grad_norm": 0.735500368867454, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 6707 + }, + { + "epoch": 0.06708, + "grad_norm": 0.6126034855042024, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 6708 + }, + { + "epoch": 0.06709, + "grad_norm": 0.5356173594265632, + "learning_rate": 0.003, + "loss": 4.1, + "step": 6709 + }, + { + "epoch": 0.0671, + "grad_norm": 0.4923039482946903, + "learning_rate": 0.003, + "loss": 4.1229, + "step": 6710 + }, + { + "epoch": 0.06711, + "grad_norm": 0.496437675576904, + "learning_rate": 0.003, + "loss": 4.1316, + "step": 6711 + }, + { + "epoch": 0.06712, + "grad_norm": 0.6267076354025424, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 6712 + }, + { + "epoch": 0.06713, + "grad_norm": 0.810419464520027, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 6713 + }, + { + "epoch": 0.06714, + "grad_norm": 1.0033614640972555, + "learning_rate": 0.003, + "loss": 4.1182, + "step": 6714 + }, + { + "epoch": 0.06715, + "grad_norm": 1.034862211372409, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 6715 + }, + { + "epoch": 0.06716, + "grad_norm": 0.7891873870335114, + "learning_rate": 0.003, + "loss": 4.1258, + "step": 6716 + }, + { + "epoch": 0.06717, + "grad_norm": 0.7583795710672536, + "learning_rate": 0.003, + "loss": 4.1501, + "step": 6717 + }, + { + "epoch": 0.06718, + "grad_norm": 0.7115916620356093, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 6718 + }, + { + "epoch": 0.06719, + "grad_norm": 0.6809781091009515, + "learning_rate": 0.003, + "loss": 4.1068, + "step": 6719 + }, + { + "epoch": 0.0672, + "grad_norm": 0.7352768877036369, + "learning_rate": 0.003, + "loss": 4.1239, + "step": 6720 + }, + { + "epoch": 0.06721, + "grad_norm": 0.6315118398036524, + "learning_rate": 0.003, + "loss": 4.135, + "step": 6721 + }, + { + "epoch": 0.06722, + "grad_norm": 0.6185480236265406, + "learning_rate": 0.003, + "loss": 4.1388, + "step": 6722 + }, + { + "epoch": 0.06723, + "grad_norm": 0.5845982684430056, + "learning_rate": 0.003, + "loss": 4.1285, + "step": 6723 + }, + { + "epoch": 0.06724, + "grad_norm": 0.49159392452345974, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 6724 + }, + { + "epoch": 0.06725, + "grad_norm": 0.4513358433552748, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 6725 + }, + { + "epoch": 0.06726, + "grad_norm": 0.4366793957137453, + "learning_rate": 0.003, + "loss": 4.1429, + "step": 6726 + }, + { + "epoch": 0.06727, + "grad_norm": 0.45039616788114, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 6727 + }, + { + "epoch": 0.06728, + "grad_norm": 0.42960667974723327, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 6728 + }, + { + "epoch": 0.06729, + "grad_norm": 0.3936495109654878, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 6729 + }, + { + "epoch": 0.0673, + "grad_norm": 0.4004584193437624, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 6730 + }, + { + "epoch": 0.06731, + "grad_norm": 0.43331873995398884, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 6731 + }, + { + "epoch": 0.06732, + "grad_norm": 0.5033918683059013, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 6732 + }, + { + "epoch": 0.06733, + "grad_norm": 0.5679439839709435, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 6733 + }, + { + "epoch": 0.06734, + "grad_norm": 0.6741835879416314, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 6734 + }, + { + "epoch": 0.06735, + "grad_norm": 0.7951013050676844, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 6735 + }, + { + "epoch": 0.06736, + "grad_norm": 0.8142625336131883, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 6736 + }, + { + "epoch": 0.06737, + "grad_norm": 0.6923418050164777, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 6737 + }, + { + "epoch": 0.06738, + "grad_norm": 0.6417437188150639, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 6738 + }, + { + "epoch": 0.06739, + "grad_norm": 0.6666846912447963, + "learning_rate": 0.003, + "loss": 4.102, + "step": 6739 + }, + { + "epoch": 0.0674, + "grad_norm": 0.7113245183023263, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 6740 + }, + { + "epoch": 0.06741, + "grad_norm": 0.696453096464101, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 6741 + }, + { + "epoch": 0.06742, + "grad_norm": 0.6475796676339262, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 6742 + }, + { + "epoch": 0.06743, + "grad_norm": 0.6335412186431436, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 6743 + }, + { + "epoch": 0.06744, + "grad_norm": 0.5563393921556343, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 6744 + }, + { + "epoch": 0.06745, + "grad_norm": 0.5419204393939647, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 6745 + }, + { + "epoch": 0.06746, + "grad_norm": 0.6275067206076368, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 6746 + }, + { + "epoch": 0.06747, + "grad_norm": 0.6946085858510286, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 6747 + }, + { + "epoch": 0.06748, + "grad_norm": 0.7589007060444024, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 6748 + }, + { + "epoch": 0.06749, + "grad_norm": 0.8157554526066001, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 6749 + }, + { + "epoch": 0.0675, + "grad_norm": 0.7942374978397424, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 6750 + }, + { + "epoch": 0.06751, + "grad_norm": 0.8980523962537803, + "learning_rate": 0.003, + "loss": 4.1399, + "step": 6751 + }, + { + "epoch": 0.06752, + "grad_norm": 1.0269645356154302, + "learning_rate": 0.003, + "loss": 4.1455, + "step": 6752 + }, + { + "epoch": 0.06753, + "grad_norm": 0.9257798372553527, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 6753 + }, + { + "epoch": 0.06754, + "grad_norm": 0.7843345022720416, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 6754 + }, + { + "epoch": 0.06755, + "grad_norm": 0.8626796100641331, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 6755 + }, + { + "epoch": 0.06756, + "grad_norm": 0.813278917534869, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 6756 + }, + { + "epoch": 0.06757, + "grad_norm": 0.7512724737472484, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 6757 + }, + { + "epoch": 0.06758, + "grad_norm": 0.6469087254992117, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 6758 + }, + { + "epoch": 0.06759, + "grad_norm": 0.6436740764232052, + "learning_rate": 0.003, + "loss": 4.148, + "step": 6759 + }, + { + "epoch": 0.0676, + "grad_norm": 0.6943244361400559, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 6760 + }, + { + "epoch": 0.06761, + "grad_norm": 0.6830986749187081, + "learning_rate": 0.003, + "loss": 4.112, + "step": 6761 + }, + { + "epoch": 0.06762, + "grad_norm": 0.6781853474637783, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 6762 + }, + { + "epoch": 0.06763, + "grad_norm": 0.6506611393135219, + "learning_rate": 0.003, + "loss": 4.115, + "step": 6763 + }, + { + "epoch": 0.06764, + "grad_norm": 0.7027611096936427, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 6764 + }, + { + "epoch": 0.06765, + "grad_norm": 0.6155749614693148, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 6765 + }, + { + "epoch": 0.06766, + "grad_norm": 0.6035208063328635, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 6766 + }, + { + "epoch": 0.06767, + "grad_norm": 0.5420944911927922, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 6767 + }, + { + "epoch": 0.06768, + "grad_norm": 0.6137703731544314, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 6768 + }, + { + "epoch": 0.06769, + "grad_norm": 0.7672588192354372, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 6769 + }, + { + "epoch": 0.0677, + "grad_norm": 0.8517644016624661, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 6770 + }, + { + "epoch": 0.06771, + "grad_norm": 0.8225325274897667, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 6771 + }, + { + "epoch": 0.06772, + "grad_norm": 0.8285914075732632, + "learning_rate": 0.003, + "loss": 4.1242, + "step": 6772 + }, + { + "epoch": 0.06773, + "grad_norm": 0.7479732791491223, + "learning_rate": 0.003, + "loss": 4.1359, + "step": 6773 + }, + { + "epoch": 0.06774, + "grad_norm": 0.7939299101707836, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 6774 + }, + { + "epoch": 0.06775, + "grad_norm": 0.8020629017874903, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 6775 + }, + { + "epoch": 0.06776, + "grad_norm": 0.7932490440340019, + "learning_rate": 0.003, + "loss": 4.1527, + "step": 6776 + }, + { + "epoch": 0.06777, + "grad_norm": 0.794180803890291, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 6777 + }, + { + "epoch": 0.06778, + "grad_norm": 0.7991189828810096, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 6778 + }, + { + "epoch": 0.06779, + "grad_norm": 0.818786154364808, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 6779 + }, + { + "epoch": 0.0678, + "grad_norm": 0.712453918318275, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 6780 + }, + { + "epoch": 0.06781, + "grad_norm": 0.6542310526187651, + "learning_rate": 0.003, + "loss": 4.1549, + "step": 6781 + }, + { + "epoch": 0.06782, + "grad_norm": 0.6334297545638243, + "learning_rate": 0.003, + "loss": 4.1299, + "step": 6782 + }, + { + "epoch": 0.06783, + "grad_norm": 0.6222166600018698, + "learning_rate": 0.003, + "loss": 4.136, + "step": 6783 + }, + { + "epoch": 0.06784, + "grad_norm": 0.6173988646215278, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 6784 + }, + { + "epoch": 0.06785, + "grad_norm": 0.6089756191279135, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 6785 + }, + { + "epoch": 0.06786, + "grad_norm": 0.5533280430635569, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 6786 + }, + { + "epoch": 0.06787, + "grad_norm": 0.5466977622748588, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 6787 + }, + { + "epoch": 0.06788, + "grad_norm": 0.5709118469585935, + "learning_rate": 0.003, + "loss": 4.1259, + "step": 6788 + }, + { + "epoch": 0.06789, + "grad_norm": 0.43615212913657647, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 6789 + }, + { + "epoch": 0.0679, + "grad_norm": 0.4445553735594529, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 6790 + }, + { + "epoch": 0.06791, + "grad_norm": 0.4404711279990138, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 6791 + }, + { + "epoch": 0.06792, + "grad_norm": 0.46477638175762537, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 6792 + }, + { + "epoch": 0.06793, + "grad_norm": 0.48722297592983815, + "learning_rate": 0.003, + "loss": 4.1119, + "step": 6793 + }, + { + "epoch": 0.06794, + "grad_norm": 0.4979919688575191, + "learning_rate": 0.003, + "loss": 4.1275, + "step": 6794 + }, + { + "epoch": 0.06795, + "grad_norm": 0.49093393128752905, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 6795 + }, + { + "epoch": 0.06796, + "grad_norm": 0.5892149599358569, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 6796 + }, + { + "epoch": 0.06797, + "grad_norm": 0.7982288521309417, + "learning_rate": 0.003, + "loss": 4.134, + "step": 6797 + }, + { + "epoch": 0.06798, + "grad_norm": 0.9112336477160669, + "learning_rate": 0.003, + "loss": 4.1053, + "step": 6798 + }, + { + "epoch": 0.06799, + "grad_norm": 0.8816688090009441, + "learning_rate": 0.003, + "loss": 4.1463, + "step": 6799 + }, + { + "epoch": 0.068, + "grad_norm": 0.8279620352251622, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 6800 + }, + { + "epoch": 0.06801, + "grad_norm": 0.8613030262720641, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 6801 + }, + { + "epoch": 0.06802, + "grad_norm": 0.9654181970654568, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 6802 + }, + { + "epoch": 0.06803, + "grad_norm": 0.7831222168974614, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 6803 + }, + { + "epoch": 0.06804, + "grad_norm": 0.7859386559785015, + "learning_rate": 0.003, + "loss": 4.137, + "step": 6804 + }, + { + "epoch": 0.06805, + "grad_norm": 0.7680250323768286, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 6805 + }, + { + "epoch": 0.06806, + "grad_norm": 0.7789471170116108, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 6806 + }, + { + "epoch": 0.06807, + "grad_norm": 0.6389939368418367, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 6807 + }, + { + "epoch": 0.06808, + "grad_norm": 0.708633773967604, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 6808 + }, + { + "epoch": 0.06809, + "grad_norm": 0.7180318843790542, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 6809 + }, + { + "epoch": 0.0681, + "grad_norm": 0.7301766381754865, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 6810 + }, + { + "epoch": 0.06811, + "grad_norm": 0.7471263469142446, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 6811 + }, + { + "epoch": 0.06812, + "grad_norm": 0.7142209108867097, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 6812 + }, + { + "epoch": 0.06813, + "grad_norm": 0.6372769592641551, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 6813 + }, + { + "epoch": 0.06814, + "grad_norm": 0.6379955586156076, + "learning_rate": 0.003, + "loss": 4.1221, + "step": 6814 + }, + { + "epoch": 0.06815, + "grad_norm": 0.6038210649447708, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 6815 + }, + { + "epoch": 0.06816, + "grad_norm": 0.5751849767984581, + "learning_rate": 0.003, + "loss": 4.1268, + "step": 6816 + }, + { + "epoch": 0.06817, + "grad_norm": 0.5359776355709539, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 6817 + }, + { + "epoch": 0.06818, + "grad_norm": 0.5146004463623628, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 6818 + }, + { + "epoch": 0.06819, + "grad_norm": 0.5178577295105196, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 6819 + }, + { + "epoch": 0.0682, + "grad_norm": 0.5036407285943472, + "learning_rate": 0.003, + "loss": 4.1358, + "step": 6820 + }, + { + "epoch": 0.06821, + "grad_norm": 0.5531066591245212, + "learning_rate": 0.003, + "loss": 4.1271, + "step": 6821 + }, + { + "epoch": 0.06822, + "grad_norm": 0.6470177042281421, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 6822 + }, + { + "epoch": 0.06823, + "grad_norm": 0.6539637525535354, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 6823 + }, + { + "epoch": 0.06824, + "grad_norm": 0.7493448106054319, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 6824 + }, + { + "epoch": 0.06825, + "grad_norm": 0.7477095595533654, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 6825 + }, + { + "epoch": 0.06826, + "grad_norm": 0.616205641208411, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 6826 + }, + { + "epoch": 0.06827, + "grad_norm": 0.5343803407507347, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 6827 + }, + { + "epoch": 0.06828, + "grad_norm": 0.5516727566441831, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 6828 + }, + { + "epoch": 0.06829, + "grad_norm": 0.5678645003238811, + "learning_rate": 0.003, + "loss": 4.114, + "step": 6829 + }, + { + "epoch": 0.0683, + "grad_norm": 0.5736945535959624, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 6830 + }, + { + "epoch": 0.06831, + "grad_norm": 0.5766186753999193, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 6831 + }, + { + "epoch": 0.06832, + "grad_norm": 0.5412637917693169, + "learning_rate": 0.003, + "loss": 4.094, + "step": 6832 + }, + { + "epoch": 0.06833, + "grad_norm": 0.5354756199374949, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 6833 + }, + { + "epoch": 0.06834, + "grad_norm": 0.55758586869952, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 6834 + }, + { + "epoch": 0.06835, + "grad_norm": 0.5851752358292377, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 6835 + }, + { + "epoch": 0.06836, + "grad_norm": 0.5357594576872605, + "learning_rate": 0.003, + "loss": 4.1344, + "step": 6836 + }, + { + "epoch": 0.06837, + "grad_norm": 0.5493992586473802, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 6837 + }, + { + "epoch": 0.06838, + "grad_norm": 0.6237990486728586, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 6838 + }, + { + "epoch": 0.06839, + "grad_norm": 0.9418401714608854, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 6839 + }, + { + "epoch": 0.0684, + "grad_norm": 1.3064642474920947, + "learning_rate": 0.003, + "loss": 4.1397, + "step": 6840 + }, + { + "epoch": 0.06841, + "grad_norm": 0.6597660684920605, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 6841 + }, + { + "epoch": 0.06842, + "grad_norm": 0.6489661413957194, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 6842 + }, + { + "epoch": 0.06843, + "grad_norm": 0.6773408231334266, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 6843 + }, + { + "epoch": 0.06844, + "grad_norm": 0.6589160490897324, + "learning_rate": 0.003, + "loss": 4.106, + "step": 6844 + }, + { + "epoch": 0.06845, + "grad_norm": 0.5891507948339562, + "learning_rate": 0.003, + "loss": 4.1228, + "step": 6845 + }, + { + "epoch": 0.06846, + "grad_norm": 0.5999658896363759, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 6846 + }, + { + "epoch": 0.06847, + "grad_norm": 0.5887881579441098, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 6847 + }, + { + "epoch": 0.06848, + "grad_norm": 0.5649925957595652, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 6848 + }, + { + "epoch": 0.06849, + "grad_norm": 0.5705197551609986, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 6849 + }, + { + "epoch": 0.0685, + "grad_norm": 0.5952753055541125, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 6850 + }, + { + "epoch": 0.06851, + "grad_norm": 0.6213756923220035, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 6851 + }, + { + "epoch": 0.06852, + "grad_norm": 0.6907364246047624, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 6852 + }, + { + "epoch": 0.06853, + "grad_norm": 0.6969902001664416, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 6853 + }, + { + "epoch": 0.06854, + "grad_norm": 0.662173631889165, + "learning_rate": 0.003, + "loss": 4.1239, + "step": 6854 + }, + { + "epoch": 0.06855, + "grad_norm": 0.7232071834064421, + "learning_rate": 0.003, + "loss": 4.1377, + "step": 6855 + }, + { + "epoch": 0.06856, + "grad_norm": 0.7279714519404217, + "learning_rate": 0.003, + "loss": 4.1325, + "step": 6856 + }, + { + "epoch": 0.06857, + "grad_norm": 0.7721813680083663, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 6857 + }, + { + "epoch": 0.06858, + "grad_norm": 0.7425927331139939, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 6858 + }, + { + "epoch": 0.06859, + "grad_norm": 0.6721982817507133, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 6859 + }, + { + "epoch": 0.0686, + "grad_norm": 0.5774934823673541, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 6860 + }, + { + "epoch": 0.06861, + "grad_norm": 0.6138735149136986, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 6861 + }, + { + "epoch": 0.06862, + "grad_norm": 0.6955887372108659, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 6862 + }, + { + "epoch": 0.06863, + "grad_norm": 0.7889995228865956, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 6863 + }, + { + "epoch": 0.06864, + "grad_norm": 0.8319268473846568, + "learning_rate": 0.003, + "loss": 4.1189, + "step": 6864 + }, + { + "epoch": 0.06865, + "grad_norm": 0.7332088452187622, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 6865 + }, + { + "epoch": 0.06866, + "grad_norm": 0.7569264658828823, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 6866 + }, + { + "epoch": 0.06867, + "grad_norm": 0.7482157424829708, + "learning_rate": 0.003, + "loss": 4.1359, + "step": 6867 + }, + { + "epoch": 0.06868, + "grad_norm": 0.8067432575353655, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 6868 + }, + { + "epoch": 0.06869, + "grad_norm": 0.795724395615385, + "learning_rate": 0.003, + "loss": 4.127, + "step": 6869 + }, + { + "epoch": 0.0687, + "grad_norm": 0.7990167361942389, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 6870 + }, + { + "epoch": 0.06871, + "grad_norm": 0.751687574887691, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 6871 + }, + { + "epoch": 0.06872, + "grad_norm": 0.7941670622571882, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 6872 + }, + { + "epoch": 0.06873, + "grad_norm": 0.818073654767055, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 6873 + }, + { + "epoch": 0.06874, + "grad_norm": 0.8810084565313016, + "learning_rate": 0.003, + "loss": 4.13, + "step": 6874 + }, + { + "epoch": 0.06875, + "grad_norm": 0.9712435694596839, + "learning_rate": 0.003, + "loss": 4.1379, + "step": 6875 + }, + { + "epoch": 0.06876, + "grad_norm": 1.1759386697440901, + "learning_rate": 0.003, + "loss": 4.1556, + "step": 6876 + }, + { + "epoch": 0.06877, + "grad_norm": 0.8914085065829237, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 6877 + }, + { + "epoch": 0.06878, + "grad_norm": 0.6941362711827244, + "learning_rate": 0.003, + "loss": 4.102, + "step": 6878 + }, + { + "epoch": 0.06879, + "grad_norm": 0.6928015779389758, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 6879 + }, + { + "epoch": 0.0688, + "grad_norm": 0.738782255552787, + "learning_rate": 0.003, + "loss": 4.1234, + "step": 6880 + }, + { + "epoch": 0.06881, + "grad_norm": 0.6910853720919746, + "learning_rate": 0.003, + "loss": 4.112, + "step": 6881 + }, + { + "epoch": 0.06882, + "grad_norm": 0.7254038162214872, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 6882 + }, + { + "epoch": 0.06883, + "grad_norm": 0.8207902946108598, + "learning_rate": 0.003, + "loss": 4.1434, + "step": 6883 + }, + { + "epoch": 0.06884, + "grad_norm": 0.7716553726102112, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 6884 + }, + { + "epoch": 0.06885, + "grad_norm": 0.6958687421650827, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 6885 + }, + { + "epoch": 0.06886, + "grad_norm": 0.5821526100113974, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 6886 + }, + { + "epoch": 0.06887, + "grad_norm": 0.5981430384947571, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 6887 + }, + { + "epoch": 0.06888, + "grad_norm": 0.643772790056028, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 6888 + }, + { + "epoch": 0.06889, + "grad_norm": 0.6944406405800507, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 6889 + }, + { + "epoch": 0.0689, + "grad_norm": 0.6385881084418807, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 6890 + }, + { + "epoch": 0.06891, + "grad_norm": 0.6086144164229688, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 6891 + }, + { + "epoch": 0.06892, + "grad_norm": 0.6012480896266729, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 6892 + }, + { + "epoch": 0.06893, + "grad_norm": 0.561071573044732, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 6893 + }, + { + "epoch": 0.06894, + "grad_norm": 0.48860461588730103, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 6894 + }, + { + "epoch": 0.06895, + "grad_norm": 0.5246990023618513, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 6895 + }, + { + "epoch": 0.06896, + "grad_norm": 0.5887964150643885, + "learning_rate": 0.003, + "loss": 4.088, + "step": 6896 + }, + { + "epoch": 0.06897, + "grad_norm": 0.5241278786378765, + "learning_rate": 0.003, + "loss": 4.1284, + "step": 6897 + }, + { + "epoch": 0.06898, + "grad_norm": 0.5243694688992354, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 6898 + }, + { + "epoch": 0.06899, + "grad_norm": 0.5729571128070601, + "learning_rate": 0.003, + "loss": 4.1318, + "step": 6899 + }, + { + "epoch": 0.069, + "grad_norm": 0.5895390705006303, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 6900 + }, + { + "epoch": 0.06901, + "grad_norm": 0.6952354448422888, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 6901 + }, + { + "epoch": 0.06902, + "grad_norm": 0.6648257134889515, + "learning_rate": 0.003, + "loss": 4.086, + "step": 6902 + }, + { + "epoch": 0.06903, + "grad_norm": 0.6781063401244306, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 6903 + }, + { + "epoch": 0.06904, + "grad_norm": 0.6750907763884256, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 6904 + }, + { + "epoch": 0.06905, + "grad_norm": 0.701961664699957, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 6905 + }, + { + "epoch": 0.06906, + "grad_norm": 0.6469621356429142, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 6906 + }, + { + "epoch": 0.06907, + "grad_norm": 0.5765290038257871, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 6907 + }, + { + "epoch": 0.06908, + "grad_norm": 0.6197899283242572, + "learning_rate": 0.003, + "loss": 4.075, + "step": 6908 + }, + { + "epoch": 0.06909, + "grad_norm": 0.6933557799843862, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 6909 + }, + { + "epoch": 0.0691, + "grad_norm": 0.7562290219169062, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 6910 + }, + { + "epoch": 0.06911, + "grad_norm": 0.7734263332293689, + "learning_rate": 0.003, + "loss": 4.1301, + "step": 6911 + }, + { + "epoch": 0.06912, + "grad_norm": 0.8992983547552829, + "learning_rate": 0.003, + "loss": 4.1146, + "step": 6912 + }, + { + "epoch": 0.06913, + "grad_norm": 1.0208555338096066, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 6913 + }, + { + "epoch": 0.06914, + "grad_norm": 0.9933731989818703, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 6914 + }, + { + "epoch": 0.06915, + "grad_norm": 1.142744843357704, + "learning_rate": 0.003, + "loss": 4.1688, + "step": 6915 + }, + { + "epoch": 0.06916, + "grad_norm": 0.671739611249756, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 6916 + }, + { + "epoch": 0.06917, + "grad_norm": 0.7866141531687029, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 6917 + }, + { + "epoch": 0.06918, + "grad_norm": 0.9611567239474412, + "learning_rate": 0.003, + "loss": 4.1474, + "step": 6918 + }, + { + "epoch": 0.06919, + "grad_norm": 0.9336705733194112, + "learning_rate": 0.003, + "loss": 4.1458, + "step": 6919 + }, + { + "epoch": 0.0692, + "grad_norm": 0.8550272945447894, + "learning_rate": 0.003, + "loss": 4.1329, + "step": 6920 + }, + { + "epoch": 0.06921, + "grad_norm": 0.8255769935938679, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 6921 + }, + { + "epoch": 0.06922, + "grad_norm": 0.7887735703255148, + "learning_rate": 0.003, + "loss": 4.1308, + "step": 6922 + }, + { + "epoch": 0.06923, + "grad_norm": 0.7586591684231498, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 6923 + }, + { + "epoch": 0.06924, + "grad_norm": 0.833512973215651, + "learning_rate": 0.003, + "loss": 4.1555, + "step": 6924 + }, + { + "epoch": 0.06925, + "grad_norm": 0.8413623073932563, + "learning_rate": 0.003, + "loss": 4.1537, + "step": 6925 + }, + { + "epoch": 0.06926, + "grad_norm": 0.7024551972837352, + "learning_rate": 0.003, + "loss": 4.1405, + "step": 6926 + }, + { + "epoch": 0.06927, + "grad_norm": 0.5829899478703182, + "learning_rate": 0.003, + "loss": 4.1294, + "step": 6927 + }, + { + "epoch": 0.06928, + "grad_norm": 0.5794264624289892, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 6928 + }, + { + "epoch": 0.06929, + "grad_norm": 0.5549181816114577, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 6929 + }, + { + "epoch": 0.0693, + "grad_norm": 0.5430801494911696, + "learning_rate": 0.003, + "loss": 4.1566, + "step": 6930 + }, + { + "epoch": 0.06931, + "grad_norm": 0.47027424052399663, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 6931 + }, + { + "epoch": 0.06932, + "grad_norm": 0.4173009048479236, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 6932 + }, + { + "epoch": 0.06933, + "grad_norm": 0.40150485804104163, + "learning_rate": 0.003, + "loss": 4.1284, + "step": 6933 + }, + { + "epoch": 0.06934, + "grad_norm": 0.3862168071563836, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 6934 + }, + { + "epoch": 0.06935, + "grad_norm": 0.446771899060793, + "learning_rate": 0.003, + "loss": 4.1147, + "step": 6935 + }, + { + "epoch": 0.06936, + "grad_norm": 0.5004501018374794, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 6936 + }, + { + "epoch": 0.06937, + "grad_norm": 0.5488151795943027, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 6937 + }, + { + "epoch": 0.06938, + "grad_norm": 0.5770212398497512, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 6938 + }, + { + "epoch": 0.06939, + "grad_norm": 0.558653999723117, + "learning_rate": 0.003, + "loss": 4.104, + "step": 6939 + }, + { + "epoch": 0.0694, + "grad_norm": 0.5466069373357318, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 6940 + }, + { + "epoch": 0.06941, + "grad_norm": 0.49853177456508263, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 6941 + }, + { + "epoch": 0.06942, + "grad_norm": 0.506244292367137, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 6942 + }, + { + "epoch": 0.06943, + "grad_norm": 0.49638439608831303, + "learning_rate": 0.003, + "loss": 4.1147, + "step": 6943 + }, + { + "epoch": 0.06944, + "grad_norm": 0.5489964428142321, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 6944 + }, + { + "epoch": 0.06945, + "grad_norm": 0.7004086562916291, + "learning_rate": 0.003, + "loss": 4.1294, + "step": 6945 + }, + { + "epoch": 0.06946, + "grad_norm": 0.8618988339111132, + "learning_rate": 0.003, + "loss": 4.1138, + "step": 6946 + }, + { + "epoch": 0.06947, + "grad_norm": 0.8898968651463349, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 6947 + }, + { + "epoch": 0.06948, + "grad_norm": 0.8891675654883304, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 6948 + }, + { + "epoch": 0.06949, + "grad_norm": 0.8415925076980938, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 6949 + }, + { + "epoch": 0.0695, + "grad_norm": 0.8358314389372723, + "learning_rate": 0.003, + "loss": 4.106, + "step": 6950 + }, + { + "epoch": 0.06951, + "grad_norm": 0.8991779236158414, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 6951 + }, + { + "epoch": 0.06952, + "grad_norm": 0.9143145994663924, + "learning_rate": 0.003, + "loss": 4.1467, + "step": 6952 + }, + { + "epoch": 0.06953, + "grad_norm": 0.8582198172305813, + "learning_rate": 0.003, + "loss": 4.1448, + "step": 6953 + }, + { + "epoch": 0.06954, + "grad_norm": 0.8532675006476708, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 6954 + }, + { + "epoch": 0.06955, + "grad_norm": 0.8361880964205014, + "learning_rate": 0.003, + "loss": 4.1358, + "step": 6955 + }, + { + "epoch": 0.06956, + "grad_norm": 0.7160992886820927, + "learning_rate": 0.003, + "loss": 4.1352, + "step": 6956 + }, + { + "epoch": 0.06957, + "grad_norm": 0.6572004040022867, + "learning_rate": 0.003, + "loss": 4.112, + "step": 6957 + }, + { + "epoch": 0.06958, + "grad_norm": 0.6687661358163699, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 6958 + }, + { + "epoch": 0.06959, + "grad_norm": 0.6817550869048531, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 6959 + }, + { + "epoch": 0.0696, + "grad_norm": 0.7535821582695273, + "learning_rate": 0.003, + "loss": 4.1349, + "step": 6960 + }, + { + "epoch": 0.06961, + "grad_norm": 0.8825707617170178, + "learning_rate": 0.003, + "loss": 4.138, + "step": 6961 + }, + { + "epoch": 0.06962, + "grad_norm": 0.8710338520388163, + "learning_rate": 0.003, + "loss": 4.1283, + "step": 6962 + }, + { + "epoch": 0.06963, + "grad_norm": 0.816650176387816, + "learning_rate": 0.003, + "loss": 4.1337, + "step": 6963 + }, + { + "epoch": 0.06964, + "grad_norm": 0.7260316746191819, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 6964 + }, + { + "epoch": 0.06965, + "grad_norm": 0.6648041497655378, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 6965 + }, + { + "epoch": 0.06966, + "grad_norm": 0.6784510161564018, + "learning_rate": 0.003, + "loss": 4.131, + "step": 6966 + }, + { + "epoch": 0.06967, + "grad_norm": 0.6452607728019111, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 6967 + }, + { + "epoch": 0.06968, + "grad_norm": 0.6001265248194674, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 6968 + }, + { + "epoch": 0.06969, + "grad_norm": 0.5934642265206468, + "learning_rate": 0.003, + "loss": 4.1301, + "step": 6969 + }, + { + "epoch": 0.0697, + "grad_norm": 0.5831037045271329, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 6970 + }, + { + "epoch": 0.06971, + "grad_norm": 0.6199075375261113, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 6971 + }, + { + "epoch": 0.06972, + "grad_norm": 0.6436556062187045, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 6972 + }, + { + "epoch": 0.06973, + "grad_norm": 0.5645264329417377, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 6973 + }, + { + "epoch": 0.06974, + "grad_norm": 0.6421089807938115, + "learning_rate": 0.003, + "loss": 4.1055, + "step": 6974 + }, + { + "epoch": 0.06975, + "grad_norm": 0.7540210531000154, + "learning_rate": 0.003, + "loss": 4.1165, + "step": 6975 + }, + { + "epoch": 0.06976, + "grad_norm": 0.7638332625980192, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 6976 + }, + { + "epoch": 0.06977, + "grad_norm": 0.8335885386349529, + "learning_rate": 0.003, + "loss": 4.1551, + "step": 6977 + }, + { + "epoch": 0.06978, + "grad_norm": 0.8006226145225701, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 6978 + }, + { + "epoch": 0.06979, + "grad_norm": 0.7348797862910301, + "learning_rate": 0.003, + "loss": 4.1325, + "step": 6979 + }, + { + "epoch": 0.0698, + "grad_norm": 0.8010250875166668, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 6980 + }, + { + "epoch": 0.06981, + "grad_norm": 0.7496972705067594, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 6981 + }, + { + "epoch": 0.06982, + "grad_norm": 0.7126647429947338, + "learning_rate": 0.003, + "loss": 4.105, + "step": 6982 + }, + { + "epoch": 0.06983, + "grad_norm": 0.676360984622482, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 6983 + }, + { + "epoch": 0.06984, + "grad_norm": 0.6167843107245039, + "learning_rate": 0.003, + "loss": 4.092, + "step": 6984 + }, + { + "epoch": 0.06985, + "grad_norm": 0.5226662640709087, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 6985 + }, + { + "epoch": 0.06986, + "grad_norm": 0.4870800595121638, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 6986 + }, + { + "epoch": 0.06987, + "grad_norm": 0.4732107972316138, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 6987 + }, + { + "epoch": 0.06988, + "grad_norm": 0.5060749910774011, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 6988 + }, + { + "epoch": 0.06989, + "grad_norm": 0.6140987263407802, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 6989 + }, + { + "epoch": 0.0699, + "grad_norm": 0.6629504376964789, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 6990 + }, + { + "epoch": 0.06991, + "grad_norm": 0.7586325106957013, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 6991 + }, + { + "epoch": 0.06992, + "grad_norm": 0.6309648702561338, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 6992 + }, + { + "epoch": 0.06993, + "grad_norm": 0.5315699899335162, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 6993 + }, + { + "epoch": 0.06994, + "grad_norm": 0.48772801302005414, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 6994 + }, + { + "epoch": 0.06995, + "grad_norm": 0.5131657989899862, + "learning_rate": 0.003, + "loss": 4.1132, + "step": 6995 + }, + { + "epoch": 0.06996, + "grad_norm": 0.5515157780044487, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 6996 + }, + { + "epoch": 0.06997, + "grad_norm": 0.5525209598406478, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 6997 + }, + { + "epoch": 0.06998, + "grad_norm": 0.479008437370693, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 6998 + }, + { + "epoch": 0.06999, + "grad_norm": 0.4505882666020481, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 6999 + }, + { + "epoch": 0.07, + "grad_norm": 0.520401586803475, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 7000 + }, + { + "epoch": 0.07001, + "grad_norm": 0.5963833247005839, + "learning_rate": 0.003, + "loss": 4.1418, + "step": 7001 + }, + { + "epoch": 0.07002, + "grad_norm": 0.7816965535130395, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 7002 + }, + { + "epoch": 0.07003, + "grad_norm": 0.8103989079782393, + "learning_rate": 0.003, + "loss": 4.09, + "step": 7003 + }, + { + "epoch": 0.07004, + "grad_norm": 0.7881315831755371, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 7004 + }, + { + "epoch": 0.07005, + "grad_norm": 0.8308231793794756, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 7005 + }, + { + "epoch": 0.07006, + "grad_norm": 0.8109007968000713, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 7006 + }, + { + "epoch": 0.07007, + "grad_norm": 0.6344467493229883, + "learning_rate": 0.003, + "loss": 4.1017, + "step": 7007 + }, + { + "epoch": 0.07008, + "grad_norm": 0.6334215629519069, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 7008 + }, + { + "epoch": 0.07009, + "grad_norm": 0.6472001020552638, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 7009 + }, + { + "epoch": 0.0701, + "grad_norm": 0.6243314055174884, + "learning_rate": 0.003, + "loss": 4.09, + "step": 7010 + }, + { + "epoch": 0.07011, + "grad_norm": 0.6932842423649617, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 7011 + }, + { + "epoch": 0.07012, + "grad_norm": 0.8521696913542735, + "learning_rate": 0.003, + "loss": 4.1263, + "step": 7012 + }, + { + "epoch": 0.07013, + "grad_norm": 1.0813751987049736, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 7013 + }, + { + "epoch": 0.07014, + "grad_norm": 0.8685320890891782, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 7014 + }, + { + "epoch": 0.07015, + "grad_norm": 0.714165389488949, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 7015 + }, + { + "epoch": 0.07016, + "grad_norm": 0.6240822477720798, + "learning_rate": 0.003, + "loss": 4.1017, + "step": 7016 + }, + { + "epoch": 0.07017, + "grad_norm": 0.6300786532998714, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 7017 + }, + { + "epoch": 0.07018, + "grad_norm": 0.6076108348038608, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 7018 + }, + { + "epoch": 0.07019, + "grad_norm": 0.6908646612154652, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 7019 + }, + { + "epoch": 0.0702, + "grad_norm": 0.6690787244309745, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 7020 + }, + { + "epoch": 0.07021, + "grad_norm": 0.6214099299934482, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 7021 + }, + { + "epoch": 0.07022, + "grad_norm": 0.5406773132983848, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 7022 + }, + { + "epoch": 0.07023, + "grad_norm": 0.5564313613572602, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 7023 + }, + { + "epoch": 0.07024, + "grad_norm": 0.565839780513049, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 7024 + }, + { + "epoch": 0.07025, + "grad_norm": 0.6209168576950462, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 7025 + }, + { + "epoch": 0.07026, + "grad_norm": 0.6750589310489371, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 7026 + }, + { + "epoch": 0.07027, + "grad_norm": 0.683143198082187, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 7027 + }, + { + "epoch": 0.07028, + "grad_norm": 0.6337869888733688, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 7028 + }, + { + "epoch": 0.07029, + "grad_norm": 0.6399163304203913, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 7029 + }, + { + "epoch": 0.0703, + "grad_norm": 0.6748721551610196, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 7030 + }, + { + "epoch": 0.07031, + "grad_norm": 0.7321679893744895, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 7031 + }, + { + "epoch": 0.07032, + "grad_norm": 0.8299726001491252, + "learning_rate": 0.003, + "loss": 4.1179, + "step": 7032 + }, + { + "epoch": 0.07033, + "grad_norm": 0.9060063098737187, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 7033 + }, + { + "epoch": 0.07034, + "grad_norm": 0.9522354378897846, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 7034 + }, + { + "epoch": 0.07035, + "grad_norm": 0.972269290200196, + "learning_rate": 0.003, + "loss": 4.128, + "step": 7035 + }, + { + "epoch": 0.07036, + "grad_norm": 0.924567971293714, + "learning_rate": 0.003, + "loss": 4.1179, + "step": 7036 + }, + { + "epoch": 0.07037, + "grad_norm": 0.8651360365392121, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 7037 + }, + { + "epoch": 0.07038, + "grad_norm": 0.8060967376427206, + "learning_rate": 0.003, + "loss": 4.1262, + "step": 7038 + }, + { + "epoch": 0.07039, + "grad_norm": 0.7677552044419784, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 7039 + }, + { + "epoch": 0.0704, + "grad_norm": 0.7934246766415343, + "learning_rate": 0.003, + "loss": 4.1431, + "step": 7040 + }, + { + "epoch": 0.07041, + "grad_norm": 0.7702031563514626, + "learning_rate": 0.003, + "loss": 4.116, + "step": 7041 + }, + { + "epoch": 0.07042, + "grad_norm": 0.8294171357357263, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 7042 + }, + { + "epoch": 0.07043, + "grad_norm": 0.8699931370635664, + "learning_rate": 0.003, + "loss": 4.1423, + "step": 7043 + }, + { + "epoch": 0.07044, + "grad_norm": 0.8662992135149028, + "learning_rate": 0.003, + "loss": 4.1445, + "step": 7044 + }, + { + "epoch": 0.07045, + "grad_norm": 0.8556015774986996, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 7045 + }, + { + "epoch": 0.07046, + "grad_norm": 0.8195747843665967, + "learning_rate": 0.003, + "loss": 4.1359, + "step": 7046 + }, + { + "epoch": 0.07047, + "grad_norm": 0.7213877816923308, + "learning_rate": 0.003, + "loss": 4.1305, + "step": 7047 + }, + { + "epoch": 0.07048, + "grad_norm": 0.6149100905732247, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 7048 + }, + { + "epoch": 0.07049, + "grad_norm": 0.6732224266829641, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 7049 + }, + { + "epoch": 0.0705, + "grad_norm": 0.7014634218429036, + "learning_rate": 0.003, + "loss": 4.1362, + "step": 7050 + }, + { + "epoch": 0.07051, + "grad_norm": 0.6522225830743739, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 7051 + }, + { + "epoch": 0.07052, + "grad_norm": 0.6500823083801979, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 7052 + }, + { + "epoch": 0.07053, + "grad_norm": 0.6368059007709821, + "learning_rate": 0.003, + "loss": 4.103, + "step": 7053 + }, + { + "epoch": 0.07054, + "grad_norm": 0.6937824763862059, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 7054 + }, + { + "epoch": 0.07055, + "grad_norm": 0.6678203074975709, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 7055 + }, + { + "epoch": 0.07056, + "grad_norm": 0.5999332348304974, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 7056 + }, + { + "epoch": 0.07057, + "grad_norm": 0.6207684817645435, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 7057 + }, + { + "epoch": 0.07058, + "grad_norm": 0.6334942997590186, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 7058 + }, + { + "epoch": 0.07059, + "grad_norm": 0.6248499550694564, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 7059 + }, + { + "epoch": 0.0706, + "grad_norm": 0.6690971751004321, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 7060 + }, + { + "epoch": 0.07061, + "grad_norm": 0.6681006456986236, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 7061 + }, + { + "epoch": 0.07062, + "grad_norm": 0.5873099188423571, + "learning_rate": 0.003, + "loss": 4.133, + "step": 7062 + }, + { + "epoch": 0.07063, + "grad_norm": 0.5660315602781396, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 7063 + }, + { + "epoch": 0.07064, + "grad_norm": 0.5354212695010325, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 7064 + }, + { + "epoch": 0.07065, + "grad_norm": 0.5541650096697861, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 7065 + }, + { + "epoch": 0.07066, + "grad_norm": 0.5737529936227163, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 7066 + }, + { + "epoch": 0.07067, + "grad_norm": 0.6443507476053769, + "learning_rate": 0.003, + "loss": 4.1481, + "step": 7067 + }, + { + "epoch": 0.07068, + "grad_norm": 0.7079051560187819, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 7068 + }, + { + "epoch": 0.07069, + "grad_norm": 0.5681815389061051, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 7069 + }, + { + "epoch": 0.0707, + "grad_norm": 0.5054781241764719, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 7070 + }, + { + "epoch": 0.07071, + "grad_norm": 0.5363029539486839, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 7071 + }, + { + "epoch": 0.07072, + "grad_norm": 0.5662713105012626, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 7072 + }, + { + "epoch": 0.07073, + "grad_norm": 0.5886362212149604, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 7073 + }, + { + "epoch": 0.07074, + "grad_norm": 0.5383952438154733, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 7074 + }, + { + "epoch": 0.07075, + "grad_norm": 0.49108475293469883, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 7075 + }, + { + "epoch": 0.07076, + "grad_norm": 0.46684864123512826, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 7076 + }, + { + "epoch": 0.07077, + "grad_norm": 0.5465447445092506, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 7077 + }, + { + "epoch": 0.07078, + "grad_norm": 0.6015521724166992, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 7078 + }, + { + "epoch": 0.07079, + "grad_norm": 0.85890424119227, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 7079 + }, + { + "epoch": 0.0708, + "grad_norm": 1.282741314578649, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 7080 + }, + { + "epoch": 0.07081, + "grad_norm": 0.6908289270870185, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 7081 + }, + { + "epoch": 0.07082, + "grad_norm": 0.6179688734599202, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 7082 + }, + { + "epoch": 0.07083, + "grad_norm": 0.7995600194026101, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 7083 + }, + { + "epoch": 0.07084, + "grad_norm": 0.8857457001103319, + "learning_rate": 0.003, + "loss": 4.113, + "step": 7084 + }, + { + "epoch": 0.07085, + "grad_norm": 0.9143259065826967, + "learning_rate": 0.003, + "loss": 4.1363, + "step": 7085 + }, + { + "epoch": 0.07086, + "grad_norm": 0.9810070731706124, + "learning_rate": 0.003, + "loss": 4.12, + "step": 7086 + }, + { + "epoch": 0.07087, + "grad_norm": 0.9123121727972462, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 7087 + }, + { + "epoch": 0.07088, + "grad_norm": 0.8624587235237147, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 7088 + }, + { + "epoch": 0.07089, + "grad_norm": 0.7897069970288599, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 7089 + }, + { + "epoch": 0.0709, + "grad_norm": 0.6952949093057026, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 7090 + }, + { + "epoch": 0.07091, + "grad_norm": 0.6864230472842601, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 7091 + }, + { + "epoch": 0.07092, + "grad_norm": 0.6649365850217969, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 7092 + }, + { + "epoch": 0.07093, + "grad_norm": 0.6841019927195824, + "learning_rate": 0.003, + "loss": 4.1305, + "step": 7093 + }, + { + "epoch": 0.07094, + "grad_norm": 0.6127976340559059, + "learning_rate": 0.003, + "loss": 4.1468, + "step": 7094 + }, + { + "epoch": 0.07095, + "grad_norm": 0.616886834283065, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 7095 + }, + { + "epoch": 0.07096, + "grad_norm": 0.6971966421797507, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 7096 + }, + { + "epoch": 0.07097, + "grad_norm": 0.6710301285847851, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 7097 + }, + { + "epoch": 0.07098, + "grad_norm": 0.61562136549296, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 7098 + }, + { + "epoch": 0.07099, + "grad_norm": 0.6419148521913456, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 7099 + }, + { + "epoch": 0.071, + "grad_norm": 0.6323659270253572, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 7100 + }, + { + "epoch": 0.07101, + "grad_norm": 0.6790975940780517, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 7101 + }, + { + "epoch": 0.07102, + "grad_norm": 0.7012073427223774, + "learning_rate": 0.003, + "loss": 4.112, + "step": 7102 + }, + { + "epoch": 0.07103, + "grad_norm": 0.8170376783966959, + "learning_rate": 0.003, + "loss": 4.1282, + "step": 7103 + }, + { + "epoch": 0.07104, + "grad_norm": 0.9864196954449765, + "learning_rate": 0.003, + "loss": 4.1317, + "step": 7104 + }, + { + "epoch": 0.07105, + "grad_norm": 0.9605522303404178, + "learning_rate": 0.003, + "loss": 4.123, + "step": 7105 + }, + { + "epoch": 0.07106, + "grad_norm": 0.8886566459127305, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 7106 + }, + { + "epoch": 0.07107, + "grad_norm": 0.8973256822058434, + "learning_rate": 0.003, + "loss": 4.1364, + "step": 7107 + }, + { + "epoch": 0.07108, + "grad_norm": 0.8957355256957436, + "learning_rate": 0.003, + "loss": 4.1396, + "step": 7108 + }, + { + "epoch": 0.07109, + "grad_norm": 0.9603444798838409, + "learning_rate": 0.003, + "loss": 4.1331, + "step": 7109 + }, + { + "epoch": 0.0711, + "grad_norm": 1.071506569773605, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 7110 + }, + { + "epoch": 0.07111, + "grad_norm": 0.8800294655537059, + "learning_rate": 0.003, + "loss": 4.1255, + "step": 7111 + }, + { + "epoch": 0.07112, + "grad_norm": 0.651480864700506, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 7112 + }, + { + "epoch": 0.07113, + "grad_norm": 0.5845950738429475, + "learning_rate": 0.003, + "loss": 4.1496, + "step": 7113 + }, + { + "epoch": 0.07114, + "grad_norm": 0.5822849601468679, + "learning_rate": 0.003, + "loss": 4.1284, + "step": 7114 + }, + { + "epoch": 0.07115, + "grad_norm": 0.5474586592685721, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 7115 + }, + { + "epoch": 0.07116, + "grad_norm": 0.5187520875086112, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 7116 + }, + { + "epoch": 0.07117, + "grad_norm": 0.4394900872650167, + "learning_rate": 0.003, + "loss": 4.1249, + "step": 7117 + }, + { + "epoch": 0.07118, + "grad_norm": 0.3463005557935632, + "learning_rate": 0.003, + "loss": 4.1263, + "step": 7118 + }, + { + "epoch": 0.07119, + "grad_norm": 0.3752207233130492, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 7119 + }, + { + "epoch": 0.0712, + "grad_norm": 0.4024093721623634, + "learning_rate": 0.003, + "loss": 4.102, + "step": 7120 + }, + { + "epoch": 0.07121, + "grad_norm": 0.4320668079849204, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 7121 + }, + { + "epoch": 0.07122, + "grad_norm": 0.43814596025832847, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 7122 + }, + { + "epoch": 0.07123, + "grad_norm": 0.4660611414965408, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 7123 + }, + { + "epoch": 0.07124, + "grad_norm": 0.4926215620935902, + "learning_rate": 0.003, + "loss": 4.083, + "step": 7124 + }, + { + "epoch": 0.07125, + "grad_norm": 0.4996717729986805, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 7125 + }, + { + "epoch": 0.07126, + "grad_norm": 0.5425463804440391, + "learning_rate": 0.003, + "loss": 4.1229, + "step": 7126 + }, + { + "epoch": 0.07127, + "grad_norm": 0.6028489984656705, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 7127 + }, + { + "epoch": 0.07128, + "grad_norm": 0.6414630429602017, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 7128 + }, + { + "epoch": 0.07129, + "grad_norm": 0.6306384667736967, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 7129 + }, + { + "epoch": 0.0713, + "grad_norm": 0.6762556869839564, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 7130 + }, + { + "epoch": 0.07131, + "grad_norm": 0.782955807754172, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 7131 + }, + { + "epoch": 0.07132, + "grad_norm": 0.8150492320482005, + "learning_rate": 0.003, + "loss": 4.1329, + "step": 7132 + }, + { + "epoch": 0.07133, + "grad_norm": 0.7371939952383894, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 7133 + }, + { + "epoch": 0.07134, + "grad_norm": 0.7676831192441333, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 7134 + }, + { + "epoch": 0.07135, + "grad_norm": 0.7690673723760045, + "learning_rate": 0.003, + "loss": 4.11, + "step": 7135 + }, + { + "epoch": 0.07136, + "grad_norm": 0.7766203756298646, + "learning_rate": 0.003, + "loss": 4.1474, + "step": 7136 + }, + { + "epoch": 0.07137, + "grad_norm": 0.807723913043927, + "learning_rate": 0.003, + "loss": 4.1121, + "step": 7137 + }, + { + "epoch": 0.07138, + "grad_norm": 0.7120609852111736, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 7138 + }, + { + "epoch": 0.07139, + "grad_norm": 0.6409244184377964, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 7139 + }, + { + "epoch": 0.0714, + "grad_norm": 0.6983765449622829, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 7140 + }, + { + "epoch": 0.07141, + "grad_norm": 0.6749975641693673, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 7141 + }, + { + "epoch": 0.07142, + "grad_norm": 0.7443875679801981, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 7142 + }, + { + "epoch": 0.07143, + "grad_norm": 0.8036162553734473, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 7143 + }, + { + "epoch": 0.07144, + "grad_norm": 0.8238780582102404, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 7144 + }, + { + "epoch": 0.07145, + "grad_norm": 0.7818657301386684, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 7145 + }, + { + "epoch": 0.07146, + "grad_norm": 0.589772103883361, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 7146 + }, + { + "epoch": 0.07147, + "grad_norm": 0.5061896394583955, + "learning_rate": 0.003, + "loss": 4.1104, + "step": 7147 + }, + { + "epoch": 0.07148, + "grad_norm": 0.5563222484565196, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 7148 + }, + { + "epoch": 0.07149, + "grad_norm": 0.6306367666731154, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 7149 + }, + { + "epoch": 0.0715, + "grad_norm": 0.6045730081269641, + "learning_rate": 0.003, + "loss": 4.1017, + "step": 7150 + }, + { + "epoch": 0.07151, + "grad_norm": 0.5783897582471499, + "learning_rate": 0.003, + "loss": 4.101, + "step": 7151 + }, + { + "epoch": 0.07152, + "grad_norm": 0.5932667743048016, + "learning_rate": 0.003, + "loss": 4.1179, + "step": 7152 + }, + { + "epoch": 0.07153, + "grad_norm": 0.60473248923499, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 7153 + }, + { + "epoch": 0.07154, + "grad_norm": 0.6236732709390373, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 7154 + }, + { + "epoch": 0.07155, + "grad_norm": 0.7143317986692795, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 7155 + }, + { + "epoch": 0.07156, + "grad_norm": 0.806274009051711, + "learning_rate": 0.003, + "loss": 4.1147, + "step": 7156 + }, + { + "epoch": 0.07157, + "grad_norm": 0.8013068941242575, + "learning_rate": 0.003, + "loss": 4.105, + "step": 7157 + }, + { + "epoch": 0.07158, + "grad_norm": 0.8062400294331641, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 7158 + }, + { + "epoch": 0.07159, + "grad_norm": 0.901901589441292, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 7159 + }, + { + "epoch": 0.0716, + "grad_norm": 0.9668482710989087, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 7160 + }, + { + "epoch": 0.07161, + "grad_norm": 0.905245384275865, + "learning_rate": 0.003, + "loss": 4.1265, + "step": 7161 + }, + { + "epoch": 0.07162, + "grad_norm": 0.9280503905267976, + "learning_rate": 0.003, + "loss": 4.1358, + "step": 7162 + }, + { + "epoch": 0.07163, + "grad_norm": 0.9016130134973356, + "learning_rate": 0.003, + "loss": 4.126, + "step": 7163 + }, + { + "epoch": 0.07164, + "grad_norm": 0.8238069946322247, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 7164 + }, + { + "epoch": 0.07165, + "grad_norm": 0.9229684257907796, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 7165 + }, + { + "epoch": 0.07166, + "grad_norm": 0.8602566555422861, + "learning_rate": 0.003, + "loss": 4.121, + "step": 7166 + }, + { + "epoch": 0.07167, + "grad_norm": 0.7912994410141391, + "learning_rate": 0.003, + "loss": 4.1337, + "step": 7167 + }, + { + "epoch": 0.07168, + "grad_norm": 0.6645028530506353, + "learning_rate": 0.003, + "loss": 4.1462, + "step": 7168 + }, + { + "epoch": 0.07169, + "grad_norm": 0.6619833308914435, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 7169 + }, + { + "epoch": 0.0717, + "grad_norm": 0.6040314764385558, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 7170 + }, + { + "epoch": 0.07171, + "grad_norm": 0.6833108775226145, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 7171 + }, + { + "epoch": 0.07172, + "grad_norm": 0.7725987913634927, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 7172 + }, + { + "epoch": 0.07173, + "grad_norm": 0.7147302013430032, + "learning_rate": 0.003, + "loss": 4.1332, + "step": 7173 + }, + { + "epoch": 0.07174, + "grad_norm": 0.7180021296498525, + "learning_rate": 0.003, + "loss": 4.102, + "step": 7174 + }, + { + "epoch": 0.07175, + "grad_norm": 0.7457543221263961, + "learning_rate": 0.003, + "loss": 4.1433, + "step": 7175 + }, + { + "epoch": 0.07176, + "grad_norm": 0.6924904976395436, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 7176 + }, + { + "epoch": 0.07177, + "grad_norm": 0.5304650992546365, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 7177 + }, + { + "epoch": 0.07178, + "grad_norm": 0.5008427987279399, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 7178 + }, + { + "epoch": 0.07179, + "grad_norm": 0.4679265582031314, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 7179 + }, + { + "epoch": 0.0718, + "grad_norm": 0.43745452895304754, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 7180 + }, + { + "epoch": 0.07181, + "grad_norm": 0.45907357705620727, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 7181 + }, + { + "epoch": 0.07182, + "grad_norm": 0.49846152510871444, + "learning_rate": 0.003, + "loss": 4.1104, + "step": 7182 + }, + { + "epoch": 0.07183, + "grad_norm": 0.5359985839562037, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 7183 + }, + { + "epoch": 0.07184, + "grad_norm": 0.6563790510844429, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 7184 + }, + { + "epoch": 0.07185, + "grad_norm": 0.6720180437894167, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 7185 + }, + { + "epoch": 0.07186, + "grad_norm": 0.5587772637850045, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 7186 + }, + { + "epoch": 0.07187, + "grad_norm": 0.5401622815209526, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 7187 + }, + { + "epoch": 0.07188, + "grad_norm": 0.5467984886698021, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 7188 + }, + { + "epoch": 0.07189, + "grad_norm": 0.5726158372896095, + "learning_rate": 0.003, + "loss": 4.11, + "step": 7189 + }, + { + "epoch": 0.0719, + "grad_norm": 0.7251779521154695, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 7190 + }, + { + "epoch": 0.07191, + "grad_norm": 0.8469754288010317, + "learning_rate": 0.003, + "loss": 4.1261, + "step": 7191 + }, + { + "epoch": 0.07192, + "grad_norm": 0.9128848705622925, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 7192 + }, + { + "epoch": 0.07193, + "grad_norm": 0.8310324783740308, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 7193 + }, + { + "epoch": 0.07194, + "grad_norm": 0.6879939761909492, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 7194 + }, + { + "epoch": 0.07195, + "grad_norm": 0.5970160052475568, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 7195 + }, + { + "epoch": 0.07196, + "grad_norm": 0.5894178919524454, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 7196 + }, + { + "epoch": 0.07197, + "grad_norm": 0.5262073910189121, + "learning_rate": 0.003, + "loss": 4.114, + "step": 7197 + }, + { + "epoch": 0.07198, + "grad_norm": 0.5265600947376748, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 7198 + }, + { + "epoch": 0.07199, + "grad_norm": 0.541452819187127, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 7199 + }, + { + "epoch": 0.072, + "grad_norm": 0.6458006332096501, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 7200 + }, + { + "epoch": 0.07201, + "grad_norm": 0.8094071517689202, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 7201 + }, + { + "epoch": 0.07202, + "grad_norm": 0.7787699581874201, + "learning_rate": 0.003, + "loss": 4.1254, + "step": 7202 + }, + { + "epoch": 0.07203, + "grad_norm": 0.668461238530481, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 7203 + }, + { + "epoch": 0.07204, + "grad_norm": 0.6519472171530598, + "learning_rate": 0.003, + "loss": 4.1263, + "step": 7204 + }, + { + "epoch": 0.07205, + "grad_norm": 0.553477955345092, + "learning_rate": 0.003, + "loss": 4.1198, + "step": 7205 + }, + { + "epoch": 0.07206, + "grad_norm": 0.576969241800161, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 7206 + }, + { + "epoch": 0.07207, + "grad_norm": 0.6130153388967977, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 7207 + }, + { + "epoch": 0.07208, + "grad_norm": 0.6120567702128528, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 7208 + }, + { + "epoch": 0.07209, + "grad_norm": 0.5925349870606897, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 7209 + }, + { + "epoch": 0.0721, + "grad_norm": 0.5460934958885093, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 7210 + }, + { + "epoch": 0.07211, + "grad_norm": 0.5617487245207603, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 7211 + }, + { + "epoch": 0.07212, + "grad_norm": 0.5415277872929839, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 7212 + }, + { + "epoch": 0.07213, + "grad_norm": 0.5502607732589133, + "learning_rate": 0.003, + "loss": 4.1211, + "step": 7213 + }, + { + "epoch": 0.07214, + "grad_norm": 0.6409863289739841, + "learning_rate": 0.003, + "loss": 4.072, + "step": 7214 + }, + { + "epoch": 0.07215, + "grad_norm": 0.7570834086211581, + "learning_rate": 0.003, + "loss": 4.104, + "step": 7215 + }, + { + "epoch": 0.07216, + "grad_norm": 0.8132567876934826, + "learning_rate": 0.003, + "loss": 4.1165, + "step": 7216 + }, + { + "epoch": 0.07217, + "grad_norm": 0.9345387667787567, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 7217 + }, + { + "epoch": 0.07218, + "grad_norm": 1.0833907300880954, + "learning_rate": 0.003, + "loss": 4.127, + "step": 7218 + }, + { + "epoch": 0.07219, + "grad_norm": 0.9260287293122886, + "learning_rate": 0.003, + "loss": 4.101, + "step": 7219 + }, + { + "epoch": 0.0722, + "grad_norm": 1.038503139941199, + "learning_rate": 0.003, + "loss": 4.1388, + "step": 7220 + }, + { + "epoch": 0.07221, + "grad_norm": 0.9210746523660105, + "learning_rate": 0.003, + "loss": 4.1333, + "step": 7221 + }, + { + "epoch": 0.07222, + "grad_norm": 0.9858808371441633, + "learning_rate": 0.003, + "loss": 4.1283, + "step": 7222 + }, + { + "epoch": 0.07223, + "grad_norm": 0.9836284417759733, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 7223 + }, + { + "epoch": 0.07224, + "grad_norm": 0.985172160848877, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 7224 + }, + { + "epoch": 0.07225, + "grad_norm": 1.0092262989567715, + "learning_rate": 0.003, + "loss": 4.1431, + "step": 7225 + }, + { + "epoch": 0.07226, + "grad_norm": 1.0380410348021303, + "learning_rate": 0.003, + "loss": 4.1487, + "step": 7226 + }, + { + "epoch": 0.07227, + "grad_norm": 0.897496931177557, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 7227 + }, + { + "epoch": 0.07228, + "grad_norm": 0.8371397499199325, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 7228 + }, + { + "epoch": 0.07229, + "grad_norm": 0.68089602479955, + "learning_rate": 0.003, + "loss": 4.13, + "step": 7229 + }, + { + "epoch": 0.0723, + "grad_norm": 0.7440697649975428, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 7230 + }, + { + "epoch": 0.07231, + "grad_norm": 0.8546390577858282, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 7231 + }, + { + "epoch": 0.07232, + "grad_norm": 0.9911495928687625, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 7232 + }, + { + "epoch": 0.07233, + "grad_norm": 1.0402994652801865, + "learning_rate": 0.003, + "loss": 4.1494, + "step": 7233 + }, + { + "epoch": 0.07234, + "grad_norm": 0.7834142777445765, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 7234 + }, + { + "epoch": 0.07235, + "grad_norm": 0.79697526236287, + "learning_rate": 0.003, + "loss": 4.1635, + "step": 7235 + }, + { + "epoch": 0.07236, + "grad_norm": 0.8580397156652976, + "learning_rate": 0.003, + "loss": 4.1617, + "step": 7236 + }, + { + "epoch": 0.07237, + "grad_norm": 0.8311156048197793, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 7237 + }, + { + "epoch": 0.07238, + "grad_norm": 0.7018648916636654, + "learning_rate": 0.003, + "loss": 4.1421, + "step": 7238 + }, + { + "epoch": 0.07239, + "grad_norm": 0.6039008355156773, + "learning_rate": 0.003, + "loss": 4.1381, + "step": 7239 + }, + { + "epoch": 0.0724, + "grad_norm": 0.5161419422417888, + "learning_rate": 0.003, + "loss": 4.093, + "step": 7240 + }, + { + "epoch": 0.07241, + "grad_norm": 0.4874109183137781, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 7241 + }, + { + "epoch": 0.07242, + "grad_norm": 0.6217288908705515, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 7242 + }, + { + "epoch": 0.07243, + "grad_norm": 0.6488298417249205, + "learning_rate": 0.003, + "loss": 4.1269, + "step": 7243 + }, + { + "epoch": 0.07244, + "grad_norm": 0.6010925037781651, + "learning_rate": 0.003, + "loss": 4.1068, + "step": 7244 + }, + { + "epoch": 0.07245, + "grad_norm": 0.5324768587577942, + "learning_rate": 0.003, + "loss": 4.1053, + "step": 7245 + }, + { + "epoch": 0.07246, + "grad_norm": 0.49764740145745306, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 7246 + }, + { + "epoch": 0.07247, + "grad_norm": 0.4879501046529579, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 7247 + }, + { + "epoch": 0.07248, + "grad_norm": 0.44818383989320487, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 7248 + }, + { + "epoch": 0.07249, + "grad_norm": 0.4019767982051826, + "learning_rate": 0.003, + "loss": 4.072, + "step": 7249 + }, + { + "epoch": 0.0725, + "grad_norm": 0.37656345224727716, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 7250 + }, + { + "epoch": 0.07251, + "grad_norm": 0.39765646327210397, + "learning_rate": 0.003, + "loss": 4.1, + "step": 7251 + }, + { + "epoch": 0.07252, + "grad_norm": 0.4087153991886569, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 7252 + }, + { + "epoch": 0.07253, + "grad_norm": 0.4413249795378963, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 7253 + }, + { + "epoch": 0.07254, + "grad_norm": 0.49658754339700917, + "learning_rate": 0.003, + "loss": 4.117, + "step": 7254 + }, + { + "epoch": 0.07255, + "grad_norm": 0.6065287271274902, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 7255 + }, + { + "epoch": 0.07256, + "grad_norm": 0.7783935834491852, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 7256 + }, + { + "epoch": 0.07257, + "grad_norm": 0.8736962562673857, + "learning_rate": 0.003, + "loss": 4.11, + "step": 7257 + }, + { + "epoch": 0.07258, + "grad_norm": 0.7615763185801613, + "learning_rate": 0.003, + "loss": 4.076, + "step": 7258 + }, + { + "epoch": 0.07259, + "grad_norm": 0.6225198130561953, + "learning_rate": 0.003, + "loss": 4.102, + "step": 7259 + }, + { + "epoch": 0.0726, + "grad_norm": 0.6660945346455424, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 7260 + }, + { + "epoch": 0.07261, + "grad_norm": 0.6994963663146176, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 7261 + }, + { + "epoch": 0.07262, + "grad_norm": 0.7558699256531053, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 7262 + }, + { + "epoch": 0.07263, + "grad_norm": 0.8389060291693147, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 7263 + }, + { + "epoch": 0.07264, + "grad_norm": 0.9199074585746712, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 7264 + }, + { + "epoch": 0.07265, + "grad_norm": 0.9046795786100689, + "learning_rate": 0.003, + "loss": 4.1253, + "step": 7265 + }, + { + "epoch": 0.07266, + "grad_norm": 0.8984966553892594, + "learning_rate": 0.003, + "loss": 4.1462, + "step": 7266 + }, + { + "epoch": 0.07267, + "grad_norm": 0.8779492964239948, + "learning_rate": 0.003, + "loss": 4.1303, + "step": 7267 + }, + { + "epoch": 0.07268, + "grad_norm": 0.9458771160182521, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 7268 + }, + { + "epoch": 0.07269, + "grad_norm": 1.0021477216087307, + "learning_rate": 0.003, + "loss": 4.1328, + "step": 7269 + }, + { + "epoch": 0.0727, + "grad_norm": 1.0517837764583857, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 7270 + }, + { + "epoch": 0.07271, + "grad_norm": 0.7757058905695857, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 7271 + }, + { + "epoch": 0.07272, + "grad_norm": 0.6955324357433823, + "learning_rate": 0.003, + "loss": 4.1152, + "step": 7272 + }, + { + "epoch": 0.07273, + "grad_norm": 0.727966212131769, + "learning_rate": 0.003, + "loss": 4.1328, + "step": 7273 + }, + { + "epoch": 0.07274, + "grad_norm": 0.6894516072772396, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 7274 + }, + { + "epoch": 0.07275, + "grad_norm": 0.6466287667559749, + "learning_rate": 0.003, + "loss": 4.104, + "step": 7275 + }, + { + "epoch": 0.07276, + "grad_norm": 0.5806437052644794, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 7276 + }, + { + "epoch": 0.07277, + "grad_norm": 0.5491496894626376, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 7277 + }, + { + "epoch": 0.07278, + "grad_norm": 0.5807535736273424, + "learning_rate": 0.003, + "loss": 4.118, + "step": 7278 + }, + { + "epoch": 0.07279, + "grad_norm": 0.5810337347879152, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 7279 + }, + { + "epoch": 0.0728, + "grad_norm": 0.593882083760101, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 7280 + }, + { + "epoch": 0.07281, + "grad_norm": 0.5145677719892531, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 7281 + }, + { + "epoch": 0.07282, + "grad_norm": 0.5310247033119996, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 7282 + }, + { + "epoch": 0.07283, + "grad_norm": 0.5752791694559306, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 7283 + }, + { + "epoch": 0.07284, + "grad_norm": 0.6120341790052407, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 7284 + }, + { + "epoch": 0.07285, + "grad_norm": 0.7608771862935181, + "learning_rate": 0.003, + "loss": 4.1169, + "step": 7285 + }, + { + "epoch": 0.07286, + "grad_norm": 0.8881767066517879, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 7286 + }, + { + "epoch": 0.07287, + "grad_norm": 0.929008736891511, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 7287 + }, + { + "epoch": 0.07288, + "grad_norm": 0.7448189338903293, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 7288 + }, + { + "epoch": 0.07289, + "grad_norm": 0.6196893187777653, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 7289 + }, + { + "epoch": 0.0729, + "grad_norm": 0.6617627631781875, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 7290 + }, + { + "epoch": 0.07291, + "grad_norm": 0.5587104110160175, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 7291 + }, + { + "epoch": 0.07292, + "grad_norm": 0.5227848078058569, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 7292 + }, + { + "epoch": 0.07293, + "grad_norm": 0.4287989659731113, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 7293 + }, + { + "epoch": 0.07294, + "grad_norm": 0.39434672478777266, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 7294 + }, + { + "epoch": 0.07295, + "grad_norm": 0.35908636579555575, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 7295 + }, + { + "epoch": 0.07296, + "grad_norm": 0.3644756637139822, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 7296 + }, + { + "epoch": 0.07297, + "grad_norm": 0.3770629664236333, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 7297 + }, + { + "epoch": 0.07298, + "grad_norm": 0.48678207336741464, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 7298 + }, + { + "epoch": 0.07299, + "grad_norm": 0.5846170384424557, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 7299 + }, + { + "epoch": 0.073, + "grad_norm": 0.6595284998416144, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 7300 + }, + { + "epoch": 0.07301, + "grad_norm": 0.6211347463265787, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 7301 + }, + { + "epoch": 0.07302, + "grad_norm": 0.48150627502231, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 7302 + }, + { + "epoch": 0.07303, + "grad_norm": 0.5489707911727371, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 7303 + }, + { + "epoch": 0.07304, + "grad_norm": 0.5974812098818115, + "learning_rate": 0.003, + "loss": 4.085, + "step": 7304 + }, + { + "epoch": 0.07305, + "grad_norm": 0.7024670978584663, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 7305 + }, + { + "epoch": 0.07306, + "grad_norm": 0.7638676834438926, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 7306 + }, + { + "epoch": 0.07307, + "grad_norm": 0.8266201952032484, + "learning_rate": 0.003, + "loss": 4.1329, + "step": 7307 + }, + { + "epoch": 0.07308, + "grad_norm": 0.8513186722362418, + "learning_rate": 0.003, + "loss": 4.103, + "step": 7308 + }, + { + "epoch": 0.07309, + "grad_norm": 1.0336678643068749, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 7309 + }, + { + "epoch": 0.0731, + "grad_norm": 1.1463223547284502, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 7310 + }, + { + "epoch": 0.07311, + "grad_norm": 0.7757914479144218, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 7311 + }, + { + "epoch": 0.07312, + "grad_norm": 0.7055116339180564, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 7312 + }, + { + "epoch": 0.07313, + "grad_norm": 0.6548456604315757, + "learning_rate": 0.003, + "loss": 4.11, + "step": 7313 + }, + { + "epoch": 0.07314, + "grad_norm": 0.7320846700471259, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 7314 + }, + { + "epoch": 0.07315, + "grad_norm": 0.8166597914868653, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 7315 + }, + { + "epoch": 0.07316, + "grad_norm": 0.7595697268852177, + "learning_rate": 0.003, + "loss": 4.1299, + "step": 7316 + }, + { + "epoch": 0.07317, + "grad_norm": 0.6198775441688376, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 7317 + }, + { + "epoch": 0.07318, + "grad_norm": 0.5723028866588152, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 7318 + }, + { + "epoch": 0.07319, + "grad_norm": 0.6057245627345597, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 7319 + }, + { + "epoch": 0.0732, + "grad_norm": 0.7239900990155307, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 7320 + }, + { + "epoch": 0.07321, + "grad_norm": 0.6568845099790048, + "learning_rate": 0.003, + "loss": 4.098, + "step": 7321 + }, + { + "epoch": 0.07322, + "grad_norm": 0.5961986038574478, + "learning_rate": 0.003, + "loss": 4.1146, + "step": 7322 + }, + { + "epoch": 0.07323, + "grad_norm": 0.6111250018556825, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 7323 + }, + { + "epoch": 0.07324, + "grad_norm": 0.6314202107843266, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 7324 + }, + { + "epoch": 0.07325, + "grad_norm": 0.5603304228911395, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 7325 + }, + { + "epoch": 0.07326, + "grad_norm": 0.6198242641884363, + "learning_rate": 0.003, + "loss": 4.112, + "step": 7326 + }, + { + "epoch": 0.07327, + "grad_norm": 0.6180461035522653, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 7327 + }, + { + "epoch": 0.07328, + "grad_norm": 0.5997168347855873, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 7328 + }, + { + "epoch": 0.07329, + "grad_norm": 0.7505933625758654, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 7329 + }, + { + "epoch": 0.0733, + "grad_norm": 0.95882955808798, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 7330 + }, + { + "epoch": 0.07331, + "grad_norm": 1.1625555883916676, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 7331 + }, + { + "epoch": 0.07332, + "grad_norm": 0.713532800458944, + "learning_rate": 0.003, + "loss": 4.118, + "step": 7332 + }, + { + "epoch": 0.07333, + "grad_norm": 0.6381195482238602, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 7333 + }, + { + "epoch": 0.07334, + "grad_norm": 0.5803769095988841, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 7334 + }, + { + "epoch": 0.07335, + "grad_norm": 0.5486263031540451, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 7335 + }, + { + "epoch": 0.07336, + "grad_norm": 0.5662029197744988, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 7336 + }, + { + "epoch": 0.07337, + "grad_norm": 0.4935245740209676, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 7337 + }, + { + "epoch": 0.07338, + "grad_norm": 0.5251534080906687, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 7338 + }, + { + "epoch": 0.07339, + "grad_norm": 0.5426405338587877, + "learning_rate": 0.003, + "loss": 4.1228, + "step": 7339 + }, + { + "epoch": 0.0734, + "grad_norm": 0.572679387243017, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 7340 + }, + { + "epoch": 0.07341, + "grad_norm": 0.592683237309645, + "learning_rate": 0.003, + "loss": 4.1, + "step": 7341 + }, + { + "epoch": 0.07342, + "grad_norm": 0.6784287433146412, + "learning_rate": 0.003, + "loss": 4.1225, + "step": 7342 + }, + { + "epoch": 0.07343, + "grad_norm": 0.8466966833790952, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 7343 + }, + { + "epoch": 0.07344, + "grad_norm": 0.9627040021124741, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 7344 + }, + { + "epoch": 0.07345, + "grad_norm": 0.8840562235128062, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 7345 + }, + { + "epoch": 0.07346, + "grad_norm": 0.7548058738161189, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 7346 + }, + { + "epoch": 0.07347, + "grad_norm": 0.6631446634048735, + "learning_rate": 0.003, + "loss": 4.096, + "step": 7347 + }, + { + "epoch": 0.07348, + "grad_norm": 0.7265237879004127, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 7348 + }, + { + "epoch": 0.07349, + "grad_norm": 0.7506619044246161, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 7349 + }, + { + "epoch": 0.0735, + "grad_norm": 0.8461103544372488, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 7350 + }, + { + "epoch": 0.07351, + "grad_norm": 0.990138732808367, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 7351 + }, + { + "epoch": 0.07352, + "grad_norm": 1.033284964426441, + "learning_rate": 0.003, + "loss": 4.121, + "step": 7352 + }, + { + "epoch": 0.07353, + "grad_norm": 0.9900655416873149, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 7353 + }, + { + "epoch": 0.07354, + "grad_norm": 0.8494895382662214, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 7354 + }, + { + "epoch": 0.07355, + "grad_norm": 0.8514735721257103, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 7355 + }, + { + "epoch": 0.07356, + "grad_norm": 0.7642132298715535, + "learning_rate": 0.003, + "loss": 4.1294, + "step": 7356 + }, + { + "epoch": 0.07357, + "grad_norm": 0.7217630714630613, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 7357 + }, + { + "epoch": 0.07358, + "grad_norm": 0.6372792070079403, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 7358 + }, + { + "epoch": 0.07359, + "grad_norm": 0.6640339672379827, + "learning_rate": 0.003, + "loss": 4.126, + "step": 7359 + }, + { + "epoch": 0.0736, + "grad_norm": 0.6688944780134485, + "learning_rate": 0.003, + "loss": 4.1342, + "step": 7360 + }, + { + "epoch": 0.07361, + "grad_norm": 0.6165260490708869, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 7361 + }, + { + "epoch": 0.07362, + "grad_norm": 0.6046582282651884, + "learning_rate": 0.003, + "loss": 4.1271, + "step": 7362 + }, + { + "epoch": 0.07363, + "grad_norm": 0.5897978442551399, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 7363 + }, + { + "epoch": 0.07364, + "grad_norm": 0.5332996805238572, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 7364 + }, + { + "epoch": 0.07365, + "grad_norm": 0.46010756257859925, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 7365 + }, + { + "epoch": 0.07366, + "grad_norm": 0.5374848389207422, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 7366 + }, + { + "epoch": 0.07367, + "grad_norm": 0.6597380812902667, + "learning_rate": 0.003, + "loss": 4.115, + "step": 7367 + }, + { + "epoch": 0.07368, + "grad_norm": 0.8518139437419292, + "learning_rate": 0.003, + "loss": 4.1366, + "step": 7368 + }, + { + "epoch": 0.07369, + "grad_norm": 0.9618450628071217, + "learning_rate": 0.003, + "loss": 4.09, + "step": 7369 + }, + { + "epoch": 0.0737, + "grad_norm": 0.8620882223010047, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 7370 + }, + { + "epoch": 0.07371, + "grad_norm": 0.6611222884366442, + "learning_rate": 0.003, + "loss": 4.1317, + "step": 7371 + }, + { + "epoch": 0.07372, + "grad_norm": 0.7543698836104294, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 7372 + }, + { + "epoch": 0.07373, + "grad_norm": 0.7294801364711708, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 7373 + }, + { + "epoch": 0.07374, + "grad_norm": 0.6703748242790802, + "learning_rate": 0.003, + "loss": 4.106, + "step": 7374 + }, + { + "epoch": 0.07375, + "grad_norm": 0.6644624708027584, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 7375 + }, + { + "epoch": 0.07376, + "grad_norm": 0.7311681812607845, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 7376 + }, + { + "epoch": 0.07377, + "grad_norm": 0.7075965457515309, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 7377 + }, + { + "epoch": 0.07378, + "grad_norm": 0.6352699526430687, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 7378 + }, + { + "epoch": 0.07379, + "grad_norm": 0.6697856496315371, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 7379 + }, + { + "epoch": 0.0738, + "grad_norm": 0.6784741265654184, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 7380 + }, + { + "epoch": 0.07381, + "grad_norm": 0.5900914351363498, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 7381 + }, + { + "epoch": 0.07382, + "grad_norm": 0.608569663017179, + "learning_rate": 0.003, + "loss": 4.1352, + "step": 7382 + }, + { + "epoch": 0.07383, + "grad_norm": 0.6350237567101997, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 7383 + }, + { + "epoch": 0.07384, + "grad_norm": 0.6424806458733962, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 7384 + }, + { + "epoch": 0.07385, + "grad_norm": 0.6720061518073006, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 7385 + }, + { + "epoch": 0.07386, + "grad_norm": 0.7795027909968454, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 7386 + }, + { + "epoch": 0.07387, + "grad_norm": 0.8143599209673019, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 7387 + }, + { + "epoch": 0.07388, + "grad_norm": 0.7807403224128945, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 7388 + }, + { + "epoch": 0.07389, + "grad_norm": 0.6943770705390825, + "learning_rate": 0.003, + "loss": 4.097, + "step": 7389 + }, + { + "epoch": 0.0739, + "grad_norm": 0.6749062613953275, + "learning_rate": 0.003, + "loss": 4.089, + "step": 7390 + }, + { + "epoch": 0.07391, + "grad_norm": 0.6980025211034937, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 7391 + }, + { + "epoch": 0.07392, + "grad_norm": 0.8142287222162783, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 7392 + }, + { + "epoch": 0.07393, + "grad_norm": 0.848080762828504, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 7393 + }, + { + "epoch": 0.07394, + "grad_norm": 0.7052754295757714, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 7394 + }, + { + "epoch": 0.07395, + "grad_norm": 0.6918679030912211, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 7395 + }, + { + "epoch": 0.07396, + "grad_norm": 0.6396360507546505, + "learning_rate": 0.003, + "loss": 4.101, + "step": 7396 + }, + { + "epoch": 0.07397, + "grad_norm": 0.5575428300488946, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 7397 + }, + { + "epoch": 0.07398, + "grad_norm": 0.47819343358592886, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 7398 + }, + { + "epoch": 0.07399, + "grad_norm": 0.49123569869531275, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 7399 + }, + { + "epoch": 0.074, + "grad_norm": 0.4873818583702739, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 7400 + }, + { + "epoch": 0.07401, + "grad_norm": 0.5338185744914158, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 7401 + }, + { + "epoch": 0.07402, + "grad_norm": 0.6259981261871912, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 7402 + }, + { + "epoch": 0.07403, + "grad_norm": 0.7875202985260396, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 7403 + }, + { + "epoch": 0.07404, + "grad_norm": 0.9375888555588073, + "learning_rate": 0.003, + "loss": 4.148, + "step": 7404 + }, + { + "epoch": 0.07405, + "grad_norm": 0.916338808307593, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 7405 + }, + { + "epoch": 0.07406, + "grad_norm": 0.7809170835167528, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 7406 + }, + { + "epoch": 0.07407, + "grad_norm": 0.6654944055402259, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 7407 + }, + { + "epoch": 0.07408, + "grad_norm": 0.6915372132184653, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 7408 + }, + { + "epoch": 0.07409, + "grad_norm": 0.7828828066535588, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 7409 + }, + { + "epoch": 0.0741, + "grad_norm": 0.8226790763634696, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 7410 + }, + { + "epoch": 0.07411, + "grad_norm": 0.8943168078020164, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 7411 + }, + { + "epoch": 0.07412, + "grad_norm": 0.8016790470869424, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 7412 + }, + { + "epoch": 0.07413, + "grad_norm": 0.7242923730785583, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 7413 + }, + { + "epoch": 0.07414, + "grad_norm": 0.7803236835873817, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 7414 + }, + { + "epoch": 0.07415, + "grad_norm": 0.809440770991952, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 7415 + }, + { + "epoch": 0.07416, + "grad_norm": 0.814374589546072, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 7416 + }, + { + "epoch": 0.07417, + "grad_norm": 0.7472307812051249, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 7417 + }, + { + "epoch": 0.07418, + "grad_norm": 0.668144266675147, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 7418 + }, + { + "epoch": 0.07419, + "grad_norm": 0.7132902738686082, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 7419 + }, + { + "epoch": 0.0742, + "grad_norm": 0.6426183694835322, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 7420 + }, + { + "epoch": 0.07421, + "grad_norm": 0.6842452822488992, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 7421 + }, + { + "epoch": 0.07422, + "grad_norm": 0.6935109936445435, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 7422 + }, + { + "epoch": 0.07423, + "grad_norm": 0.6977420494019977, + "learning_rate": 0.003, + "loss": 4.092, + "step": 7423 + }, + { + "epoch": 0.07424, + "grad_norm": 0.66836051737428, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 7424 + }, + { + "epoch": 0.07425, + "grad_norm": 0.6071918274550937, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 7425 + }, + { + "epoch": 0.07426, + "grad_norm": 0.5672292395284034, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 7426 + }, + { + "epoch": 0.07427, + "grad_norm": 0.6881340403118316, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 7427 + }, + { + "epoch": 0.07428, + "grad_norm": 0.8295995612479321, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 7428 + }, + { + "epoch": 0.07429, + "grad_norm": 1.0310591530396886, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 7429 + }, + { + "epoch": 0.0743, + "grad_norm": 1.1113844190744362, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 7430 + }, + { + "epoch": 0.07431, + "grad_norm": 0.8368950095459845, + "learning_rate": 0.003, + "loss": 4.101, + "step": 7431 + }, + { + "epoch": 0.07432, + "grad_norm": 0.7370171566843224, + "learning_rate": 0.003, + "loss": 4.121, + "step": 7432 + }, + { + "epoch": 0.07433, + "grad_norm": 0.7186305212736634, + "learning_rate": 0.003, + "loss": 4.1063, + "step": 7433 + }, + { + "epoch": 0.07434, + "grad_norm": 0.6592511244859814, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 7434 + }, + { + "epoch": 0.07435, + "grad_norm": 0.6659908164403538, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 7435 + }, + { + "epoch": 0.07436, + "grad_norm": 0.7024784942727049, + "learning_rate": 0.003, + "loss": 4.134, + "step": 7436 + }, + { + "epoch": 0.07437, + "grad_norm": 0.750590282273592, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 7437 + }, + { + "epoch": 0.07438, + "grad_norm": 0.7342624547457341, + "learning_rate": 0.003, + "loss": 4.117, + "step": 7438 + }, + { + "epoch": 0.07439, + "grad_norm": 0.6552175598983562, + "learning_rate": 0.003, + "loss": 4.1017, + "step": 7439 + }, + { + "epoch": 0.0744, + "grad_norm": 0.6670994817257487, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 7440 + }, + { + "epoch": 0.07441, + "grad_norm": 0.5868493465333204, + "learning_rate": 0.003, + "loss": 4.1391, + "step": 7441 + }, + { + "epoch": 0.07442, + "grad_norm": 0.5749846079049408, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 7442 + }, + { + "epoch": 0.07443, + "grad_norm": 0.616246467740031, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 7443 + }, + { + "epoch": 0.07444, + "grad_norm": 0.6314116396577024, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 7444 + }, + { + "epoch": 0.07445, + "grad_norm": 0.6891618905408962, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 7445 + }, + { + "epoch": 0.07446, + "grad_norm": 0.7665111909100017, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 7446 + }, + { + "epoch": 0.07447, + "grad_norm": 0.7960586954783623, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 7447 + }, + { + "epoch": 0.07448, + "grad_norm": 0.6932171237233758, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 7448 + }, + { + "epoch": 0.07449, + "grad_norm": 0.555157340249593, + "learning_rate": 0.003, + "loss": 4.099, + "step": 7449 + }, + { + "epoch": 0.0745, + "grad_norm": 0.5727726738711602, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 7450 + }, + { + "epoch": 0.07451, + "grad_norm": 0.6677189755758194, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 7451 + }, + { + "epoch": 0.07452, + "grad_norm": 0.7461108960326893, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 7452 + }, + { + "epoch": 0.07453, + "grad_norm": 0.7689716942440091, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 7453 + }, + { + "epoch": 0.07454, + "grad_norm": 0.6847633220363298, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 7454 + }, + { + "epoch": 0.07455, + "grad_norm": 0.6834076703366209, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 7455 + }, + { + "epoch": 0.07456, + "grad_norm": 0.5915374732547324, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 7456 + }, + { + "epoch": 0.07457, + "grad_norm": 0.5353509342865896, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 7457 + }, + { + "epoch": 0.07458, + "grad_norm": 0.6021411618752377, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 7458 + }, + { + "epoch": 0.07459, + "grad_norm": 0.6164057139569667, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 7459 + }, + { + "epoch": 0.0746, + "grad_norm": 0.7212773204762993, + "learning_rate": 0.003, + "loss": 4.104, + "step": 7460 + }, + { + "epoch": 0.07461, + "grad_norm": 0.7976317064628464, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 7461 + }, + { + "epoch": 0.07462, + "grad_norm": 0.8064167886203844, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 7462 + }, + { + "epoch": 0.07463, + "grad_norm": 0.7632409751906114, + "learning_rate": 0.003, + "loss": 4.1378, + "step": 7463 + }, + { + "epoch": 0.07464, + "grad_norm": 0.7512717490057481, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 7464 + }, + { + "epoch": 0.07465, + "grad_norm": 0.7943363392230436, + "learning_rate": 0.003, + "loss": 4.107, + "step": 7465 + }, + { + "epoch": 0.07466, + "grad_norm": 0.8637427260824427, + "learning_rate": 0.003, + "loss": 4.1269, + "step": 7466 + }, + { + "epoch": 0.07467, + "grad_norm": 0.8498740214887592, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 7467 + }, + { + "epoch": 0.07468, + "grad_norm": 0.8357355406639847, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 7468 + }, + { + "epoch": 0.07469, + "grad_norm": 0.8142934568301395, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 7469 + }, + { + "epoch": 0.0747, + "grad_norm": 0.8503882150265013, + "learning_rate": 0.003, + "loss": 4.1361, + "step": 7470 + }, + { + "epoch": 0.07471, + "grad_norm": 0.8925914077857183, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 7471 + }, + { + "epoch": 0.07472, + "grad_norm": 0.9101180407681418, + "learning_rate": 0.003, + "loss": 4.1274, + "step": 7472 + }, + { + "epoch": 0.07473, + "grad_norm": 0.8580441555622639, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 7473 + }, + { + "epoch": 0.07474, + "grad_norm": 0.8896005983371758, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 7474 + }, + { + "epoch": 0.07475, + "grad_norm": 0.7824169038965194, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 7475 + }, + { + "epoch": 0.07476, + "grad_norm": 0.7710890200443516, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 7476 + }, + { + "epoch": 0.07477, + "grad_norm": 0.7900469190579128, + "learning_rate": 0.003, + "loss": 4.127, + "step": 7477 + }, + { + "epoch": 0.07478, + "grad_norm": 0.6977697020555949, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 7478 + }, + { + "epoch": 0.07479, + "grad_norm": 0.6860967351037519, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 7479 + }, + { + "epoch": 0.0748, + "grad_norm": 0.6740365801183335, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 7480 + }, + { + "epoch": 0.07481, + "grad_norm": 0.7756716209864886, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 7481 + }, + { + "epoch": 0.07482, + "grad_norm": 0.7133092139283264, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 7482 + }, + { + "epoch": 0.07483, + "grad_norm": 0.6326111223603825, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 7483 + }, + { + "epoch": 0.07484, + "grad_norm": 0.5282476293532727, + "learning_rate": 0.003, + "loss": 4.1102, + "step": 7484 + }, + { + "epoch": 0.07485, + "grad_norm": 0.5706931649668086, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 7485 + }, + { + "epoch": 0.07486, + "grad_norm": 0.606269213255391, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 7486 + }, + { + "epoch": 0.07487, + "grad_norm": 0.6105645025729305, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 7487 + }, + { + "epoch": 0.07488, + "grad_norm": 0.6237051677171727, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 7488 + }, + { + "epoch": 0.07489, + "grad_norm": 0.7811835813919411, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 7489 + }, + { + "epoch": 0.0749, + "grad_norm": 0.7579316329950212, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 7490 + }, + { + "epoch": 0.07491, + "grad_norm": 0.6336904857400839, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 7491 + }, + { + "epoch": 0.07492, + "grad_norm": 0.5762678308899742, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 7492 + }, + { + "epoch": 0.07493, + "grad_norm": 0.5896176068966966, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 7493 + }, + { + "epoch": 0.07494, + "grad_norm": 0.5953859444943526, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 7494 + }, + { + "epoch": 0.07495, + "grad_norm": 0.6189501827559954, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 7495 + }, + { + "epoch": 0.07496, + "grad_norm": 0.6476212217965059, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 7496 + }, + { + "epoch": 0.07497, + "grad_norm": 0.590017860504856, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 7497 + }, + { + "epoch": 0.07498, + "grad_norm": 0.5711936608923054, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 7498 + }, + { + "epoch": 0.07499, + "grad_norm": 0.6570356392413452, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 7499 + }, + { + "epoch": 0.075, + "grad_norm": 0.6759155863873976, + "learning_rate": 0.003, + "loss": 4.1151, + "step": 7500 + }, + { + "epoch": 0.07501, + "grad_norm": 0.8110049470072964, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 7501 + }, + { + "epoch": 0.07502, + "grad_norm": 0.9727707162810135, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 7502 + }, + { + "epoch": 0.07503, + "grad_norm": 1.1150877882227244, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 7503 + }, + { + "epoch": 0.07504, + "grad_norm": 0.7577757721039465, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 7504 + }, + { + "epoch": 0.07505, + "grad_norm": 0.6742452888810195, + "learning_rate": 0.003, + "loss": 4.085, + "step": 7505 + }, + { + "epoch": 0.07506, + "grad_norm": 0.6189390428317627, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 7506 + }, + { + "epoch": 0.07507, + "grad_norm": 0.6022087392016671, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 7507 + }, + { + "epoch": 0.07508, + "grad_norm": 0.5309580523879476, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 7508 + }, + { + "epoch": 0.07509, + "grad_norm": 0.5484502948250853, + "learning_rate": 0.003, + "loss": 4.096, + "step": 7509 + }, + { + "epoch": 0.0751, + "grad_norm": 0.6594506505512988, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 7510 + }, + { + "epoch": 0.07511, + "grad_norm": 0.6599050252229222, + "learning_rate": 0.003, + "loss": 4.084, + "step": 7511 + }, + { + "epoch": 0.07512, + "grad_norm": 0.5970442662703767, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 7512 + }, + { + "epoch": 0.07513, + "grad_norm": 0.5770492421013043, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 7513 + }, + { + "epoch": 0.07514, + "grad_norm": 0.5972414190275317, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 7514 + }, + { + "epoch": 0.07515, + "grad_norm": 0.5236448183039198, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 7515 + }, + { + "epoch": 0.07516, + "grad_norm": 0.491529472938158, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 7516 + }, + { + "epoch": 0.07517, + "grad_norm": 0.5269178228921855, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 7517 + }, + { + "epoch": 0.07518, + "grad_norm": 0.49875426784289123, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 7518 + }, + { + "epoch": 0.07519, + "grad_norm": 0.4804224094730234, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 7519 + }, + { + "epoch": 0.0752, + "grad_norm": 0.4238021009088441, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 7520 + }, + { + "epoch": 0.07521, + "grad_norm": 0.4356047629744244, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 7521 + }, + { + "epoch": 0.07522, + "grad_norm": 0.4848019612577865, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 7522 + }, + { + "epoch": 0.07523, + "grad_norm": 0.5676769876951443, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 7523 + }, + { + "epoch": 0.07524, + "grad_norm": 0.7969402415607215, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 7524 + }, + { + "epoch": 0.07525, + "grad_norm": 1.158159401601464, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 7525 + }, + { + "epoch": 0.07526, + "grad_norm": 0.9956964848184818, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 7526 + }, + { + "epoch": 0.07527, + "grad_norm": 1.040836474640964, + "learning_rate": 0.003, + "loss": 4.1372, + "step": 7527 + }, + { + "epoch": 0.07528, + "grad_norm": 0.9140741004270626, + "learning_rate": 0.003, + "loss": 4.1343, + "step": 7528 + }, + { + "epoch": 0.07529, + "grad_norm": 1.0052666855509356, + "learning_rate": 0.003, + "loss": 4.114, + "step": 7529 + }, + { + "epoch": 0.0753, + "grad_norm": 0.9481670539724812, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 7530 + }, + { + "epoch": 0.07531, + "grad_norm": 0.9918379168573455, + "learning_rate": 0.003, + "loss": 4.1468, + "step": 7531 + }, + { + "epoch": 0.07532, + "grad_norm": 1.0229542004769954, + "learning_rate": 0.003, + "loss": 4.1396, + "step": 7532 + }, + { + "epoch": 0.07533, + "grad_norm": 1.0918782376363978, + "learning_rate": 0.003, + "loss": 4.1343, + "step": 7533 + }, + { + "epoch": 0.07534, + "grad_norm": 0.9604541131474513, + "learning_rate": 0.003, + "loss": 4.1352, + "step": 7534 + }, + { + "epoch": 0.07535, + "grad_norm": 1.0009685102812784, + "learning_rate": 0.003, + "loss": 4.1491, + "step": 7535 + }, + { + "epoch": 0.07536, + "grad_norm": 1.0487571004898342, + "learning_rate": 0.003, + "loss": 4.1362, + "step": 7536 + }, + { + "epoch": 0.07537, + "grad_norm": 0.9991469202887211, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 7537 + }, + { + "epoch": 0.07538, + "grad_norm": 0.8819852519452202, + "learning_rate": 0.003, + "loss": 4.1478, + "step": 7538 + }, + { + "epoch": 0.07539, + "grad_norm": 0.8807197730769387, + "learning_rate": 0.003, + "loss": 4.119, + "step": 7539 + }, + { + "epoch": 0.0754, + "grad_norm": 0.877244025946411, + "learning_rate": 0.003, + "loss": 4.1383, + "step": 7540 + }, + { + "epoch": 0.07541, + "grad_norm": 0.9740029088436687, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 7541 + }, + { + "epoch": 0.07542, + "grad_norm": 0.8600848931909216, + "learning_rate": 0.003, + "loss": 4.1355, + "step": 7542 + }, + { + "epoch": 0.07543, + "grad_norm": 0.8280886875133882, + "learning_rate": 0.003, + "loss": 4.1584, + "step": 7543 + }, + { + "epoch": 0.07544, + "grad_norm": 0.8071984724777687, + "learning_rate": 0.003, + "loss": 4.1459, + "step": 7544 + }, + { + "epoch": 0.07545, + "grad_norm": 0.698095628354796, + "learning_rate": 0.003, + "loss": 4.1652, + "step": 7545 + }, + { + "epoch": 0.07546, + "grad_norm": 0.6402526854429936, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 7546 + }, + { + "epoch": 0.07547, + "grad_norm": 0.6536241524830555, + "learning_rate": 0.003, + "loss": 4.1416, + "step": 7547 + }, + { + "epoch": 0.07548, + "grad_norm": 0.5657943886646665, + "learning_rate": 0.003, + "loss": 4.1467, + "step": 7548 + }, + { + "epoch": 0.07549, + "grad_norm": 0.6076370507493793, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 7549 + }, + { + "epoch": 0.0755, + "grad_norm": 0.5883507720311228, + "learning_rate": 0.003, + "loss": 4.1397, + "step": 7550 + }, + { + "epoch": 0.07551, + "grad_norm": 0.619610373235812, + "learning_rate": 0.003, + "loss": 4.127, + "step": 7551 + }, + { + "epoch": 0.07552, + "grad_norm": 0.7123411482561006, + "learning_rate": 0.003, + "loss": 4.13, + "step": 7552 + }, + { + "epoch": 0.07553, + "grad_norm": 0.6469950236749169, + "learning_rate": 0.003, + "loss": 4.1317, + "step": 7553 + }, + { + "epoch": 0.07554, + "grad_norm": 0.6316235881426905, + "learning_rate": 0.003, + "loss": 4.1247, + "step": 7554 + }, + { + "epoch": 0.07555, + "grad_norm": 0.7256901233443528, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 7555 + }, + { + "epoch": 0.07556, + "grad_norm": 0.7618410256376742, + "learning_rate": 0.003, + "loss": 4.1341, + "step": 7556 + }, + { + "epoch": 0.07557, + "grad_norm": 0.7239253341070478, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 7557 + }, + { + "epoch": 0.07558, + "grad_norm": 0.6931338512410227, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 7558 + }, + { + "epoch": 0.07559, + "grad_norm": 0.6724575608591329, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 7559 + }, + { + "epoch": 0.0756, + "grad_norm": 0.7061480953108663, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 7560 + }, + { + "epoch": 0.07561, + "grad_norm": 0.7787438183497947, + "learning_rate": 0.003, + "loss": 4.103, + "step": 7561 + }, + { + "epoch": 0.07562, + "grad_norm": 0.9044825545181286, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 7562 + }, + { + "epoch": 0.07563, + "grad_norm": 0.9934288625594392, + "learning_rate": 0.003, + "loss": 4.1225, + "step": 7563 + }, + { + "epoch": 0.07564, + "grad_norm": 0.8635651395659641, + "learning_rate": 0.003, + "loss": 4.1388, + "step": 7564 + }, + { + "epoch": 0.07565, + "grad_norm": 0.6895282281140435, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 7565 + }, + { + "epoch": 0.07566, + "grad_norm": 0.5363646891497745, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 7566 + }, + { + "epoch": 0.07567, + "grad_norm": 0.5387866394074117, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 7567 + }, + { + "epoch": 0.07568, + "grad_norm": 0.5090019275065167, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 7568 + }, + { + "epoch": 0.07569, + "grad_norm": 0.42405385696904213, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 7569 + }, + { + "epoch": 0.0757, + "grad_norm": 0.4110011928681986, + "learning_rate": 0.003, + "loss": 4.083, + "step": 7570 + }, + { + "epoch": 0.07571, + "grad_norm": 0.4413812175592004, + "learning_rate": 0.003, + "loss": 4.075, + "step": 7571 + }, + { + "epoch": 0.07572, + "grad_norm": 0.4976749366359514, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 7572 + }, + { + "epoch": 0.07573, + "grad_norm": 0.5537972928779329, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 7573 + }, + { + "epoch": 0.07574, + "grad_norm": 0.5308733184384575, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 7574 + }, + { + "epoch": 0.07575, + "grad_norm": 0.49041822167560783, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 7575 + }, + { + "epoch": 0.07576, + "grad_norm": 0.5081385300005833, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 7576 + }, + { + "epoch": 0.07577, + "grad_norm": 0.574336572955178, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 7577 + }, + { + "epoch": 0.07578, + "grad_norm": 0.5956477110919803, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 7578 + }, + { + "epoch": 0.07579, + "grad_norm": 0.6349724495359681, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 7579 + }, + { + "epoch": 0.0758, + "grad_norm": 0.5747733937772262, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 7580 + }, + { + "epoch": 0.07581, + "grad_norm": 0.5324184467786447, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 7581 + }, + { + "epoch": 0.07582, + "grad_norm": 0.5131944070203941, + "learning_rate": 0.003, + "loss": 4.088, + "step": 7582 + }, + { + "epoch": 0.07583, + "grad_norm": 0.5085404496587494, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 7583 + }, + { + "epoch": 0.07584, + "grad_norm": 0.541822111982503, + "learning_rate": 0.003, + "loss": 4.131, + "step": 7584 + }, + { + "epoch": 0.07585, + "grad_norm": 0.61290315130478, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 7585 + }, + { + "epoch": 0.07586, + "grad_norm": 0.7597528207897615, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 7586 + }, + { + "epoch": 0.07587, + "grad_norm": 1.0042272950879196, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 7587 + }, + { + "epoch": 0.07588, + "grad_norm": 1.1991037951904149, + "learning_rate": 0.003, + "loss": 4.1146, + "step": 7588 + }, + { + "epoch": 0.07589, + "grad_norm": 0.7285232286835313, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 7589 + }, + { + "epoch": 0.0759, + "grad_norm": 0.7148833074382797, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 7590 + }, + { + "epoch": 0.07591, + "grad_norm": 0.7201822884141769, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 7591 + }, + { + "epoch": 0.07592, + "grad_norm": 0.6076702679439346, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 7592 + }, + { + "epoch": 0.07593, + "grad_norm": 0.5718081339296276, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 7593 + }, + { + "epoch": 0.07594, + "grad_norm": 0.6158587196440879, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 7594 + }, + { + "epoch": 0.07595, + "grad_norm": 0.6174048562726181, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 7595 + }, + { + "epoch": 0.07596, + "grad_norm": 0.6776269755619764, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 7596 + }, + { + "epoch": 0.07597, + "grad_norm": 0.7579861091139625, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 7597 + }, + { + "epoch": 0.07598, + "grad_norm": 0.821293646598518, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 7598 + }, + { + "epoch": 0.07599, + "grad_norm": 0.9172481452747592, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 7599 + }, + { + "epoch": 0.076, + "grad_norm": 0.9188185947900437, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 7600 + }, + { + "epoch": 0.07601, + "grad_norm": 0.8582002375162606, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 7601 + }, + { + "epoch": 0.07602, + "grad_norm": 0.7498431640329858, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 7602 + }, + { + "epoch": 0.07603, + "grad_norm": 0.727413692498518, + "learning_rate": 0.003, + "loss": 4.1063, + "step": 7603 + }, + { + "epoch": 0.07604, + "grad_norm": 0.7128563866226183, + "learning_rate": 0.003, + "loss": 4.1188, + "step": 7604 + }, + { + "epoch": 0.07605, + "grad_norm": 0.6925666833494799, + "learning_rate": 0.003, + "loss": 4.108, + "step": 7605 + }, + { + "epoch": 0.07606, + "grad_norm": 0.8051319465023743, + "learning_rate": 0.003, + "loss": 4.1653, + "step": 7606 + }, + { + "epoch": 0.07607, + "grad_norm": 0.8182132405488027, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 7607 + }, + { + "epoch": 0.07608, + "grad_norm": 0.8255552624586233, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 7608 + }, + { + "epoch": 0.07609, + "grad_norm": 0.7970747267263344, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 7609 + }, + { + "epoch": 0.0761, + "grad_norm": 0.8035430769234121, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 7610 + }, + { + "epoch": 0.07611, + "grad_norm": 0.7414024082233376, + "learning_rate": 0.003, + "loss": 4.1343, + "step": 7611 + }, + { + "epoch": 0.07612, + "grad_norm": 0.6738484686209927, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 7612 + }, + { + "epoch": 0.07613, + "grad_norm": 0.6313697370107805, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 7613 + }, + { + "epoch": 0.07614, + "grad_norm": 0.6779404296006304, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 7614 + }, + { + "epoch": 0.07615, + "grad_norm": 0.671043709478893, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 7615 + }, + { + "epoch": 0.07616, + "grad_norm": 0.648522037704203, + "learning_rate": 0.003, + "loss": 4.103, + "step": 7616 + }, + { + "epoch": 0.07617, + "grad_norm": 0.6992220289841787, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 7617 + }, + { + "epoch": 0.07618, + "grad_norm": 0.7961453102996263, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 7618 + }, + { + "epoch": 0.07619, + "grad_norm": 0.6675342147329035, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 7619 + }, + { + "epoch": 0.0762, + "grad_norm": 0.5717583537378645, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 7620 + }, + { + "epoch": 0.07621, + "grad_norm": 0.5677927273573751, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 7621 + }, + { + "epoch": 0.07622, + "grad_norm": 0.6433993271028621, + "learning_rate": 0.003, + "loss": 4.1138, + "step": 7622 + }, + { + "epoch": 0.07623, + "grad_norm": 0.6940232002524971, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 7623 + }, + { + "epoch": 0.07624, + "grad_norm": 0.7880990684776099, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 7624 + }, + { + "epoch": 0.07625, + "grad_norm": 0.8639480293754638, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 7625 + }, + { + "epoch": 0.07626, + "grad_norm": 0.8696121558470246, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 7626 + }, + { + "epoch": 0.07627, + "grad_norm": 0.6599012014594735, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 7627 + }, + { + "epoch": 0.07628, + "grad_norm": 0.6055187589728076, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 7628 + }, + { + "epoch": 0.07629, + "grad_norm": 0.5602974302890468, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 7629 + }, + { + "epoch": 0.0763, + "grad_norm": 0.5483897493872879, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 7630 + }, + { + "epoch": 0.07631, + "grad_norm": 0.5313336121851904, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 7631 + }, + { + "epoch": 0.07632, + "grad_norm": 0.4946584392262865, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 7632 + }, + { + "epoch": 0.07633, + "grad_norm": 0.4679730906531611, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 7633 + }, + { + "epoch": 0.07634, + "grad_norm": 0.5163571664886781, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 7634 + }, + { + "epoch": 0.07635, + "grad_norm": 0.5766441343095514, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 7635 + }, + { + "epoch": 0.07636, + "grad_norm": 0.6356527311929989, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 7636 + }, + { + "epoch": 0.07637, + "grad_norm": 0.7120458591171308, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 7637 + }, + { + "epoch": 0.07638, + "grad_norm": 0.7256150029226924, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 7638 + }, + { + "epoch": 0.07639, + "grad_norm": 0.6685928629135915, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 7639 + }, + { + "epoch": 0.0764, + "grad_norm": 0.7914866586728011, + "learning_rate": 0.003, + "loss": 4.1354, + "step": 7640 + }, + { + "epoch": 0.07641, + "grad_norm": 0.9183061054040322, + "learning_rate": 0.003, + "loss": 4.1304, + "step": 7641 + }, + { + "epoch": 0.07642, + "grad_norm": 0.8815667821813041, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 7642 + }, + { + "epoch": 0.07643, + "grad_norm": 0.6789085139539063, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 7643 + }, + { + "epoch": 0.07644, + "grad_norm": 0.6670039351921899, + "learning_rate": 0.003, + "loss": 4.079, + "step": 7644 + }, + { + "epoch": 0.07645, + "grad_norm": 0.6793245893987427, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 7645 + }, + { + "epoch": 0.07646, + "grad_norm": 0.6629748478851417, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 7646 + }, + { + "epoch": 0.07647, + "grad_norm": 0.7288187662625558, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 7647 + }, + { + "epoch": 0.07648, + "grad_norm": 0.889407265953888, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 7648 + }, + { + "epoch": 0.07649, + "grad_norm": 0.9687671903338628, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 7649 + }, + { + "epoch": 0.0765, + "grad_norm": 0.8049244708990047, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 7650 + }, + { + "epoch": 0.07651, + "grad_norm": 0.749604908748907, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 7651 + }, + { + "epoch": 0.07652, + "grad_norm": 0.7585566864542461, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 7652 + }, + { + "epoch": 0.07653, + "grad_norm": 0.9516693847937671, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 7653 + }, + { + "epoch": 0.07654, + "grad_norm": 1.0530156043085126, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 7654 + }, + { + "epoch": 0.07655, + "grad_norm": 0.8372129657311617, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 7655 + }, + { + "epoch": 0.07656, + "grad_norm": 0.7407223152315993, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 7656 + }, + { + "epoch": 0.07657, + "grad_norm": 0.7336956785627904, + "learning_rate": 0.003, + "loss": 4.1262, + "step": 7657 + }, + { + "epoch": 0.07658, + "grad_norm": 0.672687938805935, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 7658 + }, + { + "epoch": 0.07659, + "grad_norm": 0.7385569673711774, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 7659 + }, + { + "epoch": 0.0766, + "grad_norm": 0.8172458837525138, + "learning_rate": 0.003, + "loss": 4.1486, + "step": 7660 + }, + { + "epoch": 0.07661, + "grad_norm": 0.9105650740952083, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 7661 + }, + { + "epoch": 0.07662, + "grad_norm": 0.9795402229933363, + "learning_rate": 0.003, + "loss": 4.1453, + "step": 7662 + }, + { + "epoch": 0.07663, + "grad_norm": 0.9659565857187145, + "learning_rate": 0.003, + "loss": 4.1485, + "step": 7663 + }, + { + "epoch": 0.07664, + "grad_norm": 0.7426641164866011, + "learning_rate": 0.003, + "loss": 4.1308, + "step": 7664 + }, + { + "epoch": 0.07665, + "grad_norm": 0.7654812929638697, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 7665 + }, + { + "epoch": 0.07666, + "grad_norm": 0.7454710802815059, + "learning_rate": 0.003, + "loss": 4.1182, + "step": 7666 + }, + { + "epoch": 0.07667, + "grad_norm": 0.7064554873587587, + "learning_rate": 0.003, + "loss": 4.1243, + "step": 7667 + }, + { + "epoch": 0.07668, + "grad_norm": 0.6257727862295976, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 7668 + }, + { + "epoch": 0.07669, + "grad_norm": 0.5710711731797409, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 7669 + }, + { + "epoch": 0.0767, + "grad_norm": 0.528257520997173, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 7670 + }, + { + "epoch": 0.07671, + "grad_norm": 0.4959008285709095, + "learning_rate": 0.003, + "loss": 4.1235, + "step": 7671 + }, + { + "epoch": 0.07672, + "grad_norm": 0.45441825603578956, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 7672 + }, + { + "epoch": 0.07673, + "grad_norm": 0.4429820345077854, + "learning_rate": 0.003, + "loss": 4.1317, + "step": 7673 + }, + { + "epoch": 0.07674, + "grad_norm": 0.42319123978862466, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 7674 + }, + { + "epoch": 0.07675, + "grad_norm": 0.3845278102823501, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 7675 + }, + { + "epoch": 0.07676, + "grad_norm": 0.41229574434392646, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 7676 + }, + { + "epoch": 0.07677, + "grad_norm": 0.43520964656733063, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 7677 + }, + { + "epoch": 0.07678, + "grad_norm": 0.44518003939309553, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 7678 + }, + { + "epoch": 0.07679, + "grad_norm": 0.5155228564191591, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 7679 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5596883999233748, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 7680 + }, + { + "epoch": 0.07681, + "grad_norm": 0.5906511007621373, + "learning_rate": 0.003, + "loss": 4.097, + "step": 7681 + }, + { + "epoch": 0.07682, + "grad_norm": 0.6495617626958452, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 7682 + }, + { + "epoch": 0.07683, + "grad_norm": 0.813132646521943, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 7683 + }, + { + "epoch": 0.07684, + "grad_norm": 0.9701002240236403, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 7684 + }, + { + "epoch": 0.07685, + "grad_norm": 1.0224940781449272, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 7685 + }, + { + "epoch": 0.07686, + "grad_norm": 0.8278902572767264, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 7686 + }, + { + "epoch": 0.07687, + "grad_norm": 0.8042154248471763, + "learning_rate": 0.003, + "loss": 4.116, + "step": 7687 + }, + { + "epoch": 0.07688, + "grad_norm": 0.8028648725581159, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 7688 + }, + { + "epoch": 0.07689, + "grad_norm": 0.7584652571831005, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 7689 + }, + { + "epoch": 0.0769, + "grad_norm": 0.668275319980837, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 7690 + }, + { + "epoch": 0.07691, + "grad_norm": 0.7293348633493727, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 7691 + }, + { + "epoch": 0.07692, + "grad_norm": 0.72632873093583, + "learning_rate": 0.003, + "loss": 4.1244, + "step": 7692 + }, + { + "epoch": 0.07693, + "grad_norm": 0.7772222742789282, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 7693 + }, + { + "epoch": 0.07694, + "grad_norm": 0.844518033365835, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 7694 + }, + { + "epoch": 0.07695, + "grad_norm": 1.0562783279259773, + "learning_rate": 0.003, + "loss": 4.1359, + "step": 7695 + }, + { + "epoch": 0.07696, + "grad_norm": 0.9678281861932773, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 7696 + }, + { + "epoch": 0.07697, + "grad_norm": 0.8583167779532994, + "learning_rate": 0.003, + "loss": 4.1507, + "step": 7697 + }, + { + "epoch": 0.07698, + "grad_norm": 0.7254348536667942, + "learning_rate": 0.003, + "loss": 4.1198, + "step": 7698 + }, + { + "epoch": 0.07699, + "grad_norm": 0.7025270777281947, + "learning_rate": 0.003, + "loss": 4.147, + "step": 7699 + }, + { + "epoch": 0.077, + "grad_norm": 0.6298853792744631, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 7700 + }, + { + "epoch": 0.07701, + "grad_norm": 0.6529473251497263, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 7701 + }, + { + "epoch": 0.07702, + "grad_norm": 0.6618947976576033, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 7702 + }, + { + "epoch": 0.07703, + "grad_norm": 0.720492970627025, + "learning_rate": 0.003, + "loss": 4.1173, + "step": 7703 + }, + { + "epoch": 0.07704, + "grad_norm": 0.7521627176606427, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 7704 + }, + { + "epoch": 0.07705, + "grad_norm": 0.6974134473890328, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 7705 + }, + { + "epoch": 0.07706, + "grad_norm": 0.6519559813236994, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 7706 + }, + { + "epoch": 0.07707, + "grad_norm": 0.7138065901393107, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 7707 + }, + { + "epoch": 0.07708, + "grad_norm": 0.7603897989016979, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 7708 + }, + { + "epoch": 0.07709, + "grad_norm": 0.7728171935175017, + "learning_rate": 0.003, + "loss": 4.1131, + "step": 7709 + }, + { + "epoch": 0.0771, + "grad_norm": 0.6846056023357585, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 7710 + }, + { + "epoch": 0.07711, + "grad_norm": 0.6649417137508592, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 7711 + }, + { + "epoch": 0.07712, + "grad_norm": 0.7242895985336029, + "learning_rate": 0.003, + "loss": 4.1017, + "step": 7712 + }, + { + "epoch": 0.07713, + "grad_norm": 0.7939906705321689, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 7713 + }, + { + "epoch": 0.07714, + "grad_norm": 0.8212321036889528, + "learning_rate": 0.003, + "loss": 4.08, + "step": 7714 + }, + { + "epoch": 0.07715, + "grad_norm": 0.8121576436599612, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 7715 + }, + { + "epoch": 0.07716, + "grad_norm": 0.7330469856352435, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 7716 + }, + { + "epoch": 0.07717, + "grad_norm": 0.6433922965206615, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 7717 + }, + { + "epoch": 0.07718, + "grad_norm": 0.5861977103583862, + "learning_rate": 0.003, + "loss": 4.1255, + "step": 7718 + }, + { + "epoch": 0.07719, + "grad_norm": 0.5228180617616236, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 7719 + }, + { + "epoch": 0.0772, + "grad_norm": 0.4876233685514838, + "learning_rate": 0.003, + "loss": 4.1222, + "step": 7720 + }, + { + "epoch": 0.07721, + "grad_norm": 0.5225866371833879, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 7721 + }, + { + "epoch": 0.07722, + "grad_norm": 0.49420226274527473, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 7722 + }, + { + "epoch": 0.07723, + "grad_norm": 0.5163143220392981, + "learning_rate": 0.003, + "loss": 4.1161, + "step": 7723 + }, + { + "epoch": 0.07724, + "grad_norm": 0.5183943056733131, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 7724 + }, + { + "epoch": 0.07725, + "grad_norm": 0.544712728586757, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 7725 + }, + { + "epoch": 0.07726, + "grad_norm": 0.48659381970361326, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 7726 + }, + { + "epoch": 0.07727, + "grad_norm": 0.48012804652416147, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 7727 + }, + { + "epoch": 0.07728, + "grad_norm": 0.611442740466274, + "learning_rate": 0.003, + "loss": 4.084, + "step": 7728 + }, + { + "epoch": 0.07729, + "grad_norm": 0.7927419006950845, + "learning_rate": 0.003, + "loss": 4.115, + "step": 7729 + }, + { + "epoch": 0.0773, + "grad_norm": 1.0647142259813651, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 7730 + }, + { + "epoch": 0.07731, + "grad_norm": 1.2555263685268876, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 7731 + }, + { + "epoch": 0.07732, + "grad_norm": 0.7349062778385043, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 7732 + }, + { + "epoch": 0.07733, + "grad_norm": 0.662067738555623, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 7733 + }, + { + "epoch": 0.07734, + "grad_norm": 0.6622088276312383, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 7734 + }, + { + "epoch": 0.07735, + "grad_norm": 0.7190113835925798, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 7735 + }, + { + "epoch": 0.07736, + "grad_norm": 0.779921763945562, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 7736 + }, + { + "epoch": 0.07737, + "grad_norm": 0.7062610550634124, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 7737 + }, + { + "epoch": 0.07738, + "grad_norm": 0.781831296452957, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 7738 + }, + { + "epoch": 0.07739, + "grad_norm": 0.881490679155474, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 7739 + }, + { + "epoch": 0.0774, + "grad_norm": 0.9161291546076472, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 7740 + }, + { + "epoch": 0.07741, + "grad_norm": 0.9669089236249705, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 7741 + }, + { + "epoch": 0.07742, + "grad_norm": 0.8371183452364397, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 7742 + }, + { + "epoch": 0.07743, + "grad_norm": 0.6183081207697575, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 7743 + }, + { + "epoch": 0.07744, + "grad_norm": 0.6071723474314588, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 7744 + }, + { + "epoch": 0.07745, + "grad_norm": 0.6556013679404881, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 7745 + }, + { + "epoch": 0.07746, + "grad_norm": 0.6908482013298095, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 7746 + }, + { + "epoch": 0.07747, + "grad_norm": 0.7453245346858474, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 7747 + }, + { + "epoch": 0.07748, + "grad_norm": 0.779119331038319, + "learning_rate": 0.003, + "loss": 4.1375, + "step": 7748 + }, + { + "epoch": 0.07749, + "grad_norm": 0.6901079756719614, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 7749 + }, + { + "epoch": 0.0775, + "grad_norm": 0.7554593110119233, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 7750 + }, + { + "epoch": 0.07751, + "grad_norm": 0.6857296299894233, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 7751 + }, + { + "epoch": 0.07752, + "grad_norm": 0.6922633638861819, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 7752 + }, + { + "epoch": 0.07753, + "grad_norm": 0.6594486091898183, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 7753 + }, + { + "epoch": 0.07754, + "grad_norm": 0.5934224681021092, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 7754 + }, + { + "epoch": 0.07755, + "grad_norm": 0.5188793093092156, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 7755 + }, + { + "epoch": 0.07756, + "grad_norm": 0.48414191971528364, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 7756 + }, + { + "epoch": 0.07757, + "grad_norm": 0.47273269138835095, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 7757 + }, + { + "epoch": 0.07758, + "grad_norm": 0.49740262524638973, + "learning_rate": 0.003, + "loss": 4.075, + "step": 7758 + }, + { + "epoch": 0.07759, + "grad_norm": 0.5065560507825277, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 7759 + }, + { + "epoch": 0.0776, + "grad_norm": 0.5375106355045344, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 7760 + }, + { + "epoch": 0.07761, + "grad_norm": 0.5513605901023222, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 7761 + }, + { + "epoch": 0.07762, + "grad_norm": 0.48015165347830113, + "learning_rate": 0.003, + "loss": 4.085, + "step": 7762 + }, + { + "epoch": 0.07763, + "grad_norm": 0.5167765335801594, + "learning_rate": 0.003, + "loss": 4.103, + "step": 7763 + }, + { + "epoch": 0.07764, + "grad_norm": 0.6089153068745908, + "learning_rate": 0.003, + "loss": 4.111, + "step": 7764 + }, + { + "epoch": 0.07765, + "grad_norm": 0.7391949344092991, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 7765 + }, + { + "epoch": 0.07766, + "grad_norm": 1.072503112575427, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 7766 + }, + { + "epoch": 0.07767, + "grad_norm": 1.2741364151057324, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 7767 + }, + { + "epoch": 0.07768, + "grad_norm": 0.8964766694765297, + "learning_rate": 0.003, + "loss": 4.104, + "step": 7768 + }, + { + "epoch": 0.07769, + "grad_norm": 1.049149406706748, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 7769 + }, + { + "epoch": 0.0777, + "grad_norm": 0.930557278971072, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 7770 + }, + { + "epoch": 0.07771, + "grad_norm": 0.8923728869689802, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 7771 + }, + { + "epoch": 0.07772, + "grad_norm": 0.7822123730589138, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 7772 + }, + { + "epoch": 0.07773, + "grad_norm": 0.7222823816657866, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 7773 + }, + { + "epoch": 0.07774, + "grad_norm": 0.7187204559865789, + "learning_rate": 0.003, + "loss": 4.113, + "step": 7774 + }, + { + "epoch": 0.07775, + "grad_norm": 0.7480635321015907, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 7775 + }, + { + "epoch": 0.07776, + "grad_norm": 0.760612608235149, + "learning_rate": 0.003, + "loss": 4.1119, + "step": 7776 + }, + { + "epoch": 0.07777, + "grad_norm": 0.6882048442017037, + "learning_rate": 0.003, + "loss": 4.124, + "step": 7777 + }, + { + "epoch": 0.07778, + "grad_norm": 0.7510711884551454, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 7778 + }, + { + "epoch": 0.07779, + "grad_norm": 0.9393258215990358, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 7779 + }, + { + "epoch": 0.0778, + "grad_norm": 0.9513634587238046, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 7780 + }, + { + "epoch": 0.07781, + "grad_norm": 0.9481811143998107, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 7781 + }, + { + "epoch": 0.07782, + "grad_norm": 0.9203734843556167, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 7782 + }, + { + "epoch": 0.07783, + "grad_norm": 0.7932108521278691, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 7783 + }, + { + "epoch": 0.07784, + "grad_norm": 0.7260500811767607, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 7784 + }, + { + "epoch": 0.07785, + "grad_norm": 0.7585529189874307, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 7785 + }, + { + "epoch": 0.07786, + "grad_norm": 0.6739329538427627, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 7786 + }, + { + "epoch": 0.07787, + "grad_norm": 0.5288700684147263, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 7787 + }, + { + "epoch": 0.07788, + "grad_norm": 0.5605208483007748, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 7788 + }, + { + "epoch": 0.07789, + "grad_norm": 0.5560131490937277, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 7789 + }, + { + "epoch": 0.0779, + "grad_norm": 0.5403302533220166, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 7790 + }, + { + "epoch": 0.07791, + "grad_norm": 0.5930794987356798, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 7791 + }, + { + "epoch": 0.07792, + "grad_norm": 0.6273285492908405, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 7792 + }, + { + "epoch": 0.07793, + "grad_norm": 0.6286748399924618, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 7793 + }, + { + "epoch": 0.07794, + "grad_norm": 0.6714313089602904, + "learning_rate": 0.003, + "loss": 4.128, + "step": 7794 + }, + { + "epoch": 0.07795, + "grad_norm": 0.7779410911099373, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 7795 + }, + { + "epoch": 0.07796, + "grad_norm": 0.9238625316835825, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 7796 + }, + { + "epoch": 0.07797, + "grad_norm": 0.9115617993434088, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 7797 + }, + { + "epoch": 0.07798, + "grad_norm": 0.8524686648853493, + "learning_rate": 0.003, + "loss": 4.1306, + "step": 7798 + }, + { + "epoch": 0.07799, + "grad_norm": 0.6927867557827436, + "learning_rate": 0.003, + "loss": 4.129, + "step": 7799 + }, + { + "epoch": 0.078, + "grad_norm": 0.6008938961247131, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 7800 + }, + { + "epoch": 0.07801, + "grad_norm": 0.6422683191074112, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 7801 + }, + { + "epoch": 0.07802, + "grad_norm": 0.6032600741877926, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 7802 + }, + { + "epoch": 0.07803, + "grad_norm": 0.5863000786481797, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 7803 + }, + { + "epoch": 0.07804, + "grad_norm": 0.5317230097198795, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 7804 + }, + { + "epoch": 0.07805, + "grad_norm": 0.45647691543183516, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 7805 + }, + { + "epoch": 0.07806, + "grad_norm": 0.46699207392888487, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 7806 + }, + { + "epoch": 0.07807, + "grad_norm": 0.4393858321916459, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 7807 + }, + { + "epoch": 0.07808, + "grad_norm": 0.4646599647463825, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 7808 + }, + { + "epoch": 0.07809, + "grad_norm": 0.5658965842298878, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 7809 + }, + { + "epoch": 0.0781, + "grad_norm": 0.7181637740728304, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 7810 + }, + { + "epoch": 0.07811, + "grad_norm": 0.8109929763108267, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 7811 + }, + { + "epoch": 0.07812, + "grad_norm": 0.8368920397907048, + "learning_rate": 0.003, + "loss": 4.105, + "step": 7812 + }, + { + "epoch": 0.07813, + "grad_norm": 0.8903796018565326, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 7813 + }, + { + "epoch": 0.07814, + "grad_norm": 0.9722080091605632, + "learning_rate": 0.003, + "loss": 4.1525, + "step": 7814 + }, + { + "epoch": 0.07815, + "grad_norm": 0.9736337448735193, + "learning_rate": 0.003, + "loss": 4.1198, + "step": 7815 + }, + { + "epoch": 0.07816, + "grad_norm": 1.0961292869832548, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 7816 + }, + { + "epoch": 0.07817, + "grad_norm": 1.1565923993465608, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 7817 + }, + { + "epoch": 0.07818, + "grad_norm": 0.8585417973652822, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 7818 + }, + { + "epoch": 0.07819, + "grad_norm": 0.7416217398177496, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 7819 + }, + { + "epoch": 0.0782, + "grad_norm": 0.6344240220641632, + "learning_rate": 0.003, + "loss": 4.1308, + "step": 7820 + }, + { + "epoch": 0.07821, + "grad_norm": 0.6739464286664512, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 7821 + }, + { + "epoch": 0.07822, + "grad_norm": 0.5825705065071117, + "learning_rate": 0.003, + "loss": 4.082, + "step": 7822 + }, + { + "epoch": 0.07823, + "grad_norm": 0.5723600939543713, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 7823 + }, + { + "epoch": 0.07824, + "grad_norm": 0.5121389736529045, + "learning_rate": 0.003, + "loss": 4.1241, + "step": 7824 + }, + { + "epoch": 0.07825, + "grad_norm": 0.5355001404670079, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 7825 + }, + { + "epoch": 0.07826, + "grad_norm": 0.527193374309933, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 7826 + }, + { + "epoch": 0.07827, + "grad_norm": 0.46903753993293595, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 7827 + }, + { + "epoch": 0.07828, + "grad_norm": 0.5421978536054349, + "learning_rate": 0.003, + "loss": 4.107, + "step": 7828 + }, + { + "epoch": 0.07829, + "grad_norm": 0.5249081779647955, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 7829 + }, + { + "epoch": 0.0783, + "grad_norm": 0.5049597468526409, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 7830 + }, + { + "epoch": 0.07831, + "grad_norm": 0.5692015688709412, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 7831 + }, + { + "epoch": 0.07832, + "grad_norm": 0.6743332070780026, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 7832 + }, + { + "epoch": 0.07833, + "grad_norm": 0.7173712685283925, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 7833 + }, + { + "epoch": 0.07834, + "grad_norm": 0.8370657093807944, + "learning_rate": 0.003, + "loss": 4.149, + "step": 7834 + }, + { + "epoch": 0.07835, + "grad_norm": 1.136204980987355, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 7835 + }, + { + "epoch": 0.07836, + "grad_norm": 0.8577818143218462, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 7836 + }, + { + "epoch": 0.07837, + "grad_norm": 0.7112654311205949, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 7837 + }, + { + "epoch": 0.07838, + "grad_norm": 0.6569683165529128, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 7838 + }, + { + "epoch": 0.07839, + "grad_norm": 0.7565840760711471, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 7839 + }, + { + "epoch": 0.0784, + "grad_norm": 0.7482225327831992, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 7840 + }, + { + "epoch": 0.07841, + "grad_norm": 0.6711480993377672, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 7841 + }, + { + "epoch": 0.07842, + "grad_norm": 0.6679564294971032, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 7842 + }, + { + "epoch": 0.07843, + "grad_norm": 0.59249526868013, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 7843 + }, + { + "epoch": 0.07844, + "grad_norm": 0.5591695681252433, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 7844 + }, + { + "epoch": 0.07845, + "grad_norm": 0.6153612137689568, + "learning_rate": 0.003, + "loss": 4.1094, + "step": 7845 + }, + { + "epoch": 0.07846, + "grad_norm": 0.6618403605569363, + "learning_rate": 0.003, + "loss": 4.1274, + "step": 7846 + }, + { + "epoch": 0.07847, + "grad_norm": 0.6781576972563144, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 7847 + }, + { + "epoch": 0.07848, + "grad_norm": 0.6768216849272073, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 7848 + }, + { + "epoch": 0.07849, + "grad_norm": 0.7621052101869075, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 7849 + }, + { + "epoch": 0.0785, + "grad_norm": 0.7067170147953223, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 7850 + }, + { + "epoch": 0.07851, + "grad_norm": 0.7326500117988808, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 7851 + }, + { + "epoch": 0.07852, + "grad_norm": 0.7009642585608814, + "learning_rate": 0.003, + "loss": 4.1261, + "step": 7852 + }, + { + "epoch": 0.07853, + "grad_norm": 0.7619033300792631, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 7853 + }, + { + "epoch": 0.07854, + "grad_norm": 0.8435597472657554, + "learning_rate": 0.003, + "loss": 4.1189, + "step": 7854 + }, + { + "epoch": 0.07855, + "grad_norm": 0.8978457164901018, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 7855 + }, + { + "epoch": 0.07856, + "grad_norm": 1.023677440950207, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 7856 + }, + { + "epoch": 0.07857, + "grad_norm": 0.9231166097330715, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 7857 + }, + { + "epoch": 0.07858, + "grad_norm": 0.8633661145024862, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 7858 + }, + { + "epoch": 0.07859, + "grad_norm": 0.7860454695774618, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 7859 + }, + { + "epoch": 0.0786, + "grad_norm": 0.7590473523974762, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 7860 + }, + { + "epoch": 0.07861, + "grad_norm": 0.7742422968395642, + "learning_rate": 0.003, + "loss": 4.1255, + "step": 7861 + }, + { + "epoch": 0.07862, + "grad_norm": 0.6647634614520403, + "learning_rate": 0.003, + "loss": 4.1421, + "step": 7862 + }, + { + "epoch": 0.07863, + "grad_norm": 0.6187463355818706, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 7863 + }, + { + "epoch": 0.07864, + "grad_norm": 0.5110625417422245, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 7864 + }, + { + "epoch": 0.07865, + "grad_norm": 0.5020168122945233, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 7865 + }, + { + "epoch": 0.07866, + "grad_norm": 0.5824358186306762, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 7866 + }, + { + "epoch": 0.07867, + "grad_norm": 0.6255923217504209, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 7867 + }, + { + "epoch": 0.07868, + "grad_norm": 0.7772954380441937, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 7868 + }, + { + "epoch": 0.07869, + "grad_norm": 0.9860133943188509, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 7869 + }, + { + "epoch": 0.0787, + "grad_norm": 1.0195783394153966, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 7870 + }, + { + "epoch": 0.07871, + "grad_norm": 0.7559459514710412, + "learning_rate": 0.003, + "loss": 4.1424, + "step": 7871 + }, + { + "epoch": 0.07872, + "grad_norm": 0.9076157878017646, + "learning_rate": 0.003, + "loss": 4.1438, + "step": 7872 + }, + { + "epoch": 0.07873, + "grad_norm": 0.8226175326832436, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 7873 + }, + { + "epoch": 0.07874, + "grad_norm": 0.7406414466218076, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 7874 + }, + { + "epoch": 0.07875, + "grad_norm": 0.7388682612769197, + "learning_rate": 0.003, + "loss": 4.1222, + "step": 7875 + }, + { + "epoch": 0.07876, + "grad_norm": 0.904519114854065, + "learning_rate": 0.003, + "loss": 4.127, + "step": 7876 + }, + { + "epoch": 0.07877, + "grad_norm": 0.9368363860931168, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 7877 + }, + { + "epoch": 0.07878, + "grad_norm": 0.8598751903328211, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 7878 + }, + { + "epoch": 0.07879, + "grad_norm": 0.819312740767117, + "learning_rate": 0.003, + "loss": 4.1204, + "step": 7879 + }, + { + "epoch": 0.0788, + "grad_norm": 0.7760893771538491, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 7880 + }, + { + "epoch": 0.07881, + "grad_norm": 0.7187534747264536, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 7881 + }, + { + "epoch": 0.07882, + "grad_norm": 0.6858619284030656, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 7882 + }, + { + "epoch": 0.07883, + "grad_norm": 0.6188431011405061, + "learning_rate": 0.003, + "loss": 4.083, + "step": 7883 + }, + { + "epoch": 0.07884, + "grad_norm": 0.49119386209036325, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 7884 + }, + { + "epoch": 0.07885, + "grad_norm": 0.4425644032822491, + "learning_rate": 0.003, + "loss": 4.088, + "step": 7885 + }, + { + "epoch": 0.07886, + "grad_norm": 0.45859576158776894, + "learning_rate": 0.003, + "loss": 4.1142, + "step": 7886 + }, + { + "epoch": 0.07887, + "grad_norm": 0.4966339618463925, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 7887 + }, + { + "epoch": 0.07888, + "grad_norm": 0.5978659033312088, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 7888 + }, + { + "epoch": 0.07889, + "grad_norm": 0.7155472899570747, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 7889 + }, + { + "epoch": 0.0789, + "grad_norm": 0.7993574365094266, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 7890 + }, + { + "epoch": 0.07891, + "grad_norm": 0.7636085550101828, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 7891 + }, + { + "epoch": 0.07892, + "grad_norm": 0.6885425050961269, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 7892 + }, + { + "epoch": 0.07893, + "grad_norm": 0.6721526677142209, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 7893 + }, + { + "epoch": 0.07894, + "grad_norm": 0.6152441288738106, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 7894 + }, + { + "epoch": 0.07895, + "grad_norm": 0.561256579962152, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 7895 + }, + { + "epoch": 0.07896, + "grad_norm": 0.5735961480085187, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 7896 + }, + { + "epoch": 0.07897, + "grad_norm": 0.676934691180459, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 7897 + }, + { + "epoch": 0.07898, + "grad_norm": 0.7401825684646072, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 7898 + }, + { + "epoch": 0.07899, + "grad_norm": 0.7621955625823885, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 7899 + }, + { + "epoch": 0.079, + "grad_norm": 0.7808977446083992, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 7900 + }, + { + "epoch": 0.07901, + "grad_norm": 0.8881664429218848, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 7901 + }, + { + "epoch": 0.07902, + "grad_norm": 0.8885276958518861, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 7902 + }, + { + "epoch": 0.07903, + "grad_norm": 0.8190648495783674, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 7903 + }, + { + "epoch": 0.07904, + "grad_norm": 0.7646472367126418, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 7904 + }, + { + "epoch": 0.07905, + "grad_norm": 0.768832626042469, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 7905 + }, + { + "epoch": 0.07906, + "grad_norm": 0.7344374945822847, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 7906 + }, + { + "epoch": 0.07907, + "grad_norm": 0.5806783775411604, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 7907 + }, + { + "epoch": 0.07908, + "grad_norm": 0.5703784952596479, + "learning_rate": 0.003, + "loss": 4.107, + "step": 7908 + }, + { + "epoch": 0.07909, + "grad_norm": 0.5524374992586529, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 7909 + }, + { + "epoch": 0.0791, + "grad_norm": 0.593207520021952, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 7910 + }, + { + "epoch": 0.07911, + "grad_norm": 0.6796537564703014, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 7911 + }, + { + "epoch": 0.07912, + "grad_norm": 0.811473537909756, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 7912 + }, + { + "epoch": 0.07913, + "grad_norm": 0.9594020178558872, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 7913 + }, + { + "epoch": 0.07914, + "grad_norm": 0.8781644497544638, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 7914 + }, + { + "epoch": 0.07915, + "grad_norm": 0.7673157421848797, + "learning_rate": 0.003, + "loss": 4.086, + "step": 7915 + }, + { + "epoch": 0.07916, + "grad_norm": 0.7231720388780313, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 7916 + }, + { + "epoch": 0.07917, + "grad_norm": 0.747995150418676, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 7917 + }, + { + "epoch": 0.07918, + "grad_norm": 0.7558357775396138, + "learning_rate": 0.003, + "loss": 4.106, + "step": 7918 + }, + { + "epoch": 0.07919, + "grad_norm": 0.7473819497926562, + "learning_rate": 0.003, + "loss": 4.065, + "step": 7919 + }, + { + "epoch": 0.0792, + "grad_norm": 0.7630023142885092, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 7920 + }, + { + "epoch": 0.07921, + "grad_norm": 0.661482132358115, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 7921 + }, + { + "epoch": 0.07922, + "grad_norm": 0.6198116791527155, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 7922 + }, + { + "epoch": 0.07923, + "grad_norm": 0.6551552600939399, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 7923 + }, + { + "epoch": 0.07924, + "grad_norm": 0.6696835280064822, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 7924 + }, + { + "epoch": 0.07925, + "grad_norm": 0.6925330545340292, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 7925 + }, + { + "epoch": 0.07926, + "grad_norm": 0.7284626249714012, + "learning_rate": 0.003, + "loss": 4.1198, + "step": 7926 + }, + { + "epoch": 0.07927, + "grad_norm": 0.671892879773423, + "learning_rate": 0.003, + "loss": 4.062, + "step": 7927 + }, + { + "epoch": 0.07928, + "grad_norm": 0.6609056217998242, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 7928 + }, + { + "epoch": 0.07929, + "grad_norm": 0.6999670786024481, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 7929 + }, + { + "epoch": 0.0793, + "grad_norm": 0.6958735375819932, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 7930 + }, + { + "epoch": 0.07931, + "grad_norm": 0.7554404336036357, + "learning_rate": 0.003, + "loss": 4.1153, + "step": 7931 + }, + { + "epoch": 0.07932, + "grad_norm": 0.8243101778229396, + "learning_rate": 0.003, + "loss": 4.1151, + "step": 7932 + }, + { + "epoch": 0.07933, + "grad_norm": 0.9463359506341771, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 7933 + }, + { + "epoch": 0.07934, + "grad_norm": 1.013600457137422, + "learning_rate": 0.003, + "loss": 4.1352, + "step": 7934 + }, + { + "epoch": 0.07935, + "grad_norm": 0.8419500712154685, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 7935 + }, + { + "epoch": 0.07936, + "grad_norm": 0.6245467403233166, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 7936 + }, + { + "epoch": 0.07937, + "grad_norm": 0.7486505766341088, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 7937 + }, + { + "epoch": 0.07938, + "grad_norm": 0.8254464470107613, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 7938 + }, + { + "epoch": 0.07939, + "grad_norm": 0.8503311527457658, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 7939 + }, + { + "epoch": 0.0794, + "grad_norm": 0.7668471916028556, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 7940 + }, + { + "epoch": 0.07941, + "grad_norm": 0.6283204867133623, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 7941 + }, + { + "epoch": 0.07942, + "grad_norm": 0.5166138501693774, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 7942 + }, + { + "epoch": 0.07943, + "grad_norm": 0.5021949697098655, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 7943 + }, + { + "epoch": 0.07944, + "grad_norm": 0.5349404843772013, + "learning_rate": 0.003, + "loss": 4.121, + "step": 7944 + }, + { + "epoch": 0.07945, + "grad_norm": 0.5952661363698953, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 7945 + }, + { + "epoch": 0.07946, + "grad_norm": 0.5669655747114686, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 7946 + }, + { + "epoch": 0.07947, + "grad_norm": 0.5763265373404088, + "learning_rate": 0.003, + "loss": 4.1122, + "step": 7947 + }, + { + "epoch": 0.07948, + "grad_norm": 0.5722110776640312, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 7948 + }, + { + "epoch": 0.07949, + "grad_norm": 0.6056595909010904, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 7949 + }, + { + "epoch": 0.0795, + "grad_norm": 0.6647078565086577, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 7950 + }, + { + "epoch": 0.07951, + "grad_norm": 0.6314215964588701, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 7951 + }, + { + "epoch": 0.07952, + "grad_norm": 0.5731615532451197, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 7952 + }, + { + "epoch": 0.07953, + "grad_norm": 0.5227418275474421, + "learning_rate": 0.003, + "loss": 4.066, + "step": 7953 + }, + { + "epoch": 0.07954, + "grad_norm": 0.5425211491008127, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 7954 + }, + { + "epoch": 0.07955, + "grad_norm": 0.5562460949888285, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 7955 + }, + { + "epoch": 0.07956, + "grad_norm": 0.5957051466560916, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 7956 + }, + { + "epoch": 0.07957, + "grad_norm": 0.6056607635018184, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 7957 + }, + { + "epoch": 0.07958, + "grad_norm": 0.7512975150926531, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 7958 + }, + { + "epoch": 0.07959, + "grad_norm": 1.0080192207485463, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 7959 + }, + { + "epoch": 0.0796, + "grad_norm": 1.2487463146426276, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 7960 + }, + { + "epoch": 0.07961, + "grad_norm": 0.6826790561980052, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 7961 + }, + { + "epoch": 0.07962, + "grad_norm": 0.7451214520986289, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 7962 + }, + { + "epoch": 0.07963, + "grad_norm": 0.9631217756846927, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 7963 + }, + { + "epoch": 0.07964, + "grad_norm": 1.1941093560148814, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 7964 + }, + { + "epoch": 0.07965, + "grad_norm": 0.7914311934718551, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 7965 + }, + { + "epoch": 0.07966, + "grad_norm": 0.6290523078446999, + "learning_rate": 0.003, + "loss": 4.087, + "step": 7966 + }, + { + "epoch": 0.07967, + "grad_norm": 0.6518156345361048, + "learning_rate": 0.003, + "loss": 4.111, + "step": 7967 + }, + { + "epoch": 0.07968, + "grad_norm": 0.6214813237738295, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 7968 + }, + { + "epoch": 0.07969, + "grad_norm": 0.5554977099941081, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 7969 + }, + { + "epoch": 0.0797, + "grad_norm": 0.5640600564781908, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 7970 + }, + { + "epoch": 0.07971, + "grad_norm": 0.6596519402139773, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 7971 + }, + { + "epoch": 0.07972, + "grad_norm": 0.8545861020949079, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 7972 + }, + { + "epoch": 0.07973, + "grad_norm": 0.865871265631871, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 7973 + }, + { + "epoch": 0.07974, + "grad_norm": 0.7460236260461683, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 7974 + }, + { + "epoch": 0.07975, + "grad_norm": 0.6850055866586392, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 7975 + }, + { + "epoch": 0.07976, + "grad_norm": 0.6735013134508674, + "learning_rate": 0.003, + "loss": 4.1095, + "step": 7976 + }, + { + "epoch": 0.07977, + "grad_norm": 0.7405183514912751, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 7977 + }, + { + "epoch": 0.07978, + "grad_norm": 0.8290886310120359, + "learning_rate": 0.003, + "loss": 4.1145, + "step": 7978 + }, + { + "epoch": 0.07979, + "grad_norm": 0.8453917594571124, + "learning_rate": 0.003, + "loss": 4.093, + "step": 7979 + }, + { + "epoch": 0.0798, + "grad_norm": 0.8981121368882824, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 7980 + }, + { + "epoch": 0.07981, + "grad_norm": 0.8246540871692223, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 7981 + }, + { + "epoch": 0.07982, + "grad_norm": 0.8765802803687193, + "learning_rate": 0.003, + "loss": 4.1169, + "step": 7982 + }, + { + "epoch": 0.07983, + "grad_norm": 0.9675677298543592, + "learning_rate": 0.003, + "loss": 4.1423, + "step": 7983 + }, + { + "epoch": 0.07984, + "grad_norm": 1.0989989237488744, + "learning_rate": 0.003, + "loss": 4.1297, + "step": 7984 + }, + { + "epoch": 0.07985, + "grad_norm": 0.865566009696521, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 7985 + }, + { + "epoch": 0.07986, + "grad_norm": 0.7217554053695046, + "learning_rate": 0.003, + "loss": 4.1241, + "step": 7986 + }, + { + "epoch": 0.07987, + "grad_norm": 0.7223309169228254, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 7987 + }, + { + "epoch": 0.07988, + "grad_norm": 0.7601248914831606, + "learning_rate": 0.003, + "loss": 4.111, + "step": 7988 + }, + { + "epoch": 0.07989, + "grad_norm": 0.754472367102963, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 7989 + }, + { + "epoch": 0.0799, + "grad_norm": 0.7962535081025417, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 7990 + }, + { + "epoch": 0.07991, + "grad_norm": 0.7457639189267391, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 7991 + }, + { + "epoch": 0.07992, + "grad_norm": 0.6851938375119424, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 7992 + }, + { + "epoch": 0.07993, + "grad_norm": 0.6269563237493135, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 7993 + }, + { + "epoch": 0.07994, + "grad_norm": 0.6070969865870026, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 7994 + }, + { + "epoch": 0.07995, + "grad_norm": 0.5627702965519503, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 7995 + }, + { + "epoch": 0.07996, + "grad_norm": 0.5112924542761613, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 7996 + }, + { + "epoch": 0.07997, + "grad_norm": 0.598007231362175, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 7997 + }, + { + "epoch": 0.07998, + "grad_norm": 0.6410257731351261, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 7998 + }, + { + "epoch": 0.07999, + "grad_norm": 0.7458379686223753, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 7999 + }, + { + "epoch": 0.08, + "grad_norm": 0.7034092807624223, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 8000 + }, + { + "epoch": 0.08001, + "grad_norm": 0.5871105361909563, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 8001 + }, + { + "epoch": 0.08002, + "grad_norm": 0.5995732080621035, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 8002 + }, + { + "epoch": 0.08003, + "grad_norm": 0.6613967789238772, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 8003 + }, + { + "epoch": 0.08004, + "grad_norm": 0.6523754319361283, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 8004 + }, + { + "epoch": 0.08005, + "grad_norm": 0.7036409698174545, + "learning_rate": 0.003, + "loss": 4.1115, + "step": 8005 + }, + { + "epoch": 0.08006, + "grad_norm": 0.7212028299745142, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 8006 + }, + { + "epoch": 0.08007, + "grad_norm": 0.6666231743781851, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 8007 + }, + { + "epoch": 0.08008, + "grad_norm": 0.7849422476214978, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 8008 + }, + { + "epoch": 0.08009, + "grad_norm": 0.8161881054836299, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 8009 + }, + { + "epoch": 0.0801, + "grad_norm": 0.7682572142792697, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 8010 + }, + { + "epoch": 0.08011, + "grad_norm": 0.777422634390519, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 8011 + }, + { + "epoch": 0.08012, + "grad_norm": 0.8100758516563243, + "learning_rate": 0.003, + "loss": 4.1017, + "step": 8012 + }, + { + "epoch": 0.08013, + "grad_norm": 0.7710777007779627, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 8013 + }, + { + "epoch": 0.08014, + "grad_norm": 0.7527995072769255, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 8014 + }, + { + "epoch": 0.08015, + "grad_norm": 0.7561651584547401, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 8015 + }, + { + "epoch": 0.08016, + "grad_norm": 0.7035897971730628, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 8016 + }, + { + "epoch": 0.08017, + "grad_norm": 0.7174150406944461, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 8017 + }, + { + "epoch": 0.08018, + "grad_norm": 0.7624585759410506, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 8018 + }, + { + "epoch": 0.08019, + "grad_norm": 0.7182168882426316, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 8019 + }, + { + "epoch": 0.0802, + "grad_norm": 0.7048358879364466, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 8020 + }, + { + "epoch": 0.08021, + "grad_norm": 0.7497958878822685, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 8021 + }, + { + "epoch": 0.08022, + "grad_norm": 0.821004036800071, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 8022 + }, + { + "epoch": 0.08023, + "grad_norm": 0.9809693423098361, + "learning_rate": 0.003, + "loss": 4.1, + "step": 8023 + }, + { + "epoch": 0.08024, + "grad_norm": 1.052787459999515, + "learning_rate": 0.003, + "loss": 4.1229, + "step": 8024 + }, + { + "epoch": 0.08025, + "grad_norm": 0.8450811401393215, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 8025 + }, + { + "epoch": 0.08026, + "grad_norm": 0.807742847907378, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 8026 + }, + { + "epoch": 0.08027, + "grad_norm": 0.7663131570160219, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 8027 + }, + { + "epoch": 0.08028, + "grad_norm": 0.6840081645216585, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 8028 + }, + { + "epoch": 0.08029, + "grad_norm": 0.6282444638490934, + "learning_rate": 0.003, + "loss": 4.118, + "step": 8029 + }, + { + "epoch": 0.0803, + "grad_norm": 0.5930173859419846, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 8030 + }, + { + "epoch": 0.08031, + "grad_norm": 0.6043437323260166, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 8031 + }, + { + "epoch": 0.08032, + "grad_norm": 0.6211595627768416, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 8032 + }, + { + "epoch": 0.08033, + "grad_norm": 0.6326856189776532, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 8033 + }, + { + "epoch": 0.08034, + "grad_norm": 0.7414480977291886, + "learning_rate": 0.003, + "loss": 4.112, + "step": 8034 + }, + { + "epoch": 0.08035, + "grad_norm": 0.847555227207347, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 8035 + }, + { + "epoch": 0.08036, + "grad_norm": 0.8976992992805168, + "learning_rate": 0.003, + "loss": 4.1415, + "step": 8036 + }, + { + "epoch": 0.08037, + "grad_norm": 0.7353446869433018, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 8037 + }, + { + "epoch": 0.08038, + "grad_norm": 0.5737562105586347, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 8038 + }, + { + "epoch": 0.08039, + "grad_norm": 0.6223395377390066, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 8039 + }, + { + "epoch": 0.0804, + "grad_norm": 0.6460407278049516, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 8040 + }, + { + "epoch": 0.08041, + "grad_norm": 0.6792567658949662, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 8041 + }, + { + "epoch": 0.08042, + "grad_norm": 0.6631671232762439, + "learning_rate": 0.003, + "loss": 4.098, + "step": 8042 + }, + { + "epoch": 0.08043, + "grad_norm": 0.7543589264318384, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 8043 + }, + { + "epoch": 0.08044, + "grad_norm": 0.8201149824912215, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 8044 + }, + { + "epoch": 0.08045, + "grad_norm": 0.8899252198530396, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 8045 + }, + { + "epoch": 0.08046, + "grad_norm": 0.8349673707656354, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 8046 + }, + { + "epoch": 0.08047, + "grad_norm": 0.7510934507359202, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 8047 + }, + { + "epoch": 0.08048, + "grad_norm": 0.7197955096059386, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 8048 + }, + { + "epoch": 0.08049, + "grad_norm": 0.6884550531131161, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 8049 + }, + { + "epoch": 0.0805, + "grad_norm": 0.6875283286820179, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 8050 + }, + { + "epoch": 0.08051, + "grad_norm": 0.6837157282878973, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 8051 + }, + { + "epoch": 0.08052, + "grad_norm": 0.6102356496634533, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 8052 + }, + { + "epoch": 0.08053, + "grad_norm": 0.5591025845354667, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 8053 + }, + { + "epoch": 0.08054, + "grad_norm": 0.6059431986010548, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 8054 + }, + { + "epoch": 0.08055, + "grad_norm": 0.5808217412748479, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 8055 + }, + { + "epoch": 0.08056, + "grad_norm": 0.6445970901420555, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 8056 + }, + { + "epoch": 0.08057, + "grad_norm": 0.6726343017392604, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 8057 + }, + { + "epoch": 0.08058, + "grad_norm": 0.5999523335836248, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 8058 + }, + { + "epoch": 0.08059, + "grad_norm": 0.6543038111857801, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 8059 + }, + { + "epoch": 0.0806, + "grad_norm": 0.6982671331368794, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 8060 + }, + { + "epoch": 0.08061, + "grad_norm": 0.7521101541978558, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 8061 + }, + { + "epoch": 0.08062, + "grad_norm": 0.8023563510336984, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 8062 + }, + { + "epoch": 0.08063, + "grad_norm": 0.8276610456091178, + "learning_rate": 0.003, + "loss": 4.134, + "step": 8063 + }, + { + "epoch": 0.08064, + "grad_norm": 0.9822043974280728, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 8064 + }, + { + "epoch": 0.08065, + "grad_norm": 1.1637487862676754, + "learning_rate": 0.003, + "loss": 4.1244, + "step": 8065 + }, + { + "epoch": 0.08066, + "grad_norm": 0.9728114020342268, + "learning_rate": 0.003, + "loss": 4.136, + "step": 8066 + }, + { + "epoch": 0.08067, + "grad_norm": 0.9136508947005387, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 8067 + }, + { + "epoch": 0.08068, + "grad_norm": 1.069372223697199, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 8068 + }, + { + "epoch": 0.08069, + "grad_norm": 1.045484111398971, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 8069 + }, + { + "epoch": 0.0807, + "grad_norm": 0.9148751637475432, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 8070 + }, + { + "epoch": 0.08071, + "grad_norm": 0.7828647369529286, + "learning_rate": 0.003, + "loss": 4.1095, + "step": 8071 + }, + { + "epoch": 0.08072, + "grad_norm": 0.6962246302015118, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 8072 + }, + { + "epoch": 0.08073, + "grad_norm": 0.7338615119494287, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 8073 + }, + { + "epoch": 0.08074, + "grad_norm": 0.7372129834579164, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 8074 + }, + { + "epoch": 0.08075, + "grad_norm": 0.6637992164097991, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 8075 + }, + { + "epoch": 0.08076, + "grad_norm": 0.7364712485983946, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 8076 + }, + { + "epoch": 0.08077, + "grad_norm": 0.8373976570049751, + "learning_rate": 0.003, + "loss": 4.1347, + "step": 8077 + }, + { + "epoch": 0.08078, + "grad_norm": 0.8081955608957938, + "learning_rate": 0.003, + "loss": 4.119, + "step": 8078 + }, + { + "epoch": 0.08079, + "grad_norm": 0.6241082470180894, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 8079 + }, + { + "epoch": 0.0808, + "grad_norm": 0.5925082920698637, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 8080 + }, + { + "epoch": 0.08081, + "grad_norm": 0.5771081693438284, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 8081 + }, + { + "epoch": 0.08082, + "grad_norm": 0.5644671641528122, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 8082 + }, + { + "epoch": 0.08083, + "grad_norm": 0.5574560207893343, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 8083 + }, + { + "epoch": 0.08084, + "grad_norm": 0.5516245286767989, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 8084 + }, + { + "epoch": 0.08085, + "grad_norm": 0.4994799258132661, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 8085 + }, + { + "epoch": 0.08086, + "grad_norm": 0.5387898877477255, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 8086 + }, + { + "epoch": 0.08087, + "grad_norm": 0.5279461986880152, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 8087 + }, + { + "epoch": 0.08088, + "grad_norm": 0.6653142602263723, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 8088 + }, + { + "epoch": 0.08089, + "grad_norm": 0.7838979326031138, + "learning_rate": 0.003, + "loss": 4.063, + "step": 8089 + }, + { + "epoch": 0.0809, + "grad_norm": 0.7172870093397796, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 8090 + }, + { + "epoch": 0.08091, + "grad_norm": 0.6970224864376466, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 8091 + }, + { + "epoch": 0.08092, + "grad_norm": 0.8698816808081968, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 8092 + }, + { + "epoch": 0.08093, + "grad_norm": 0.8903936647822183, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 8093 + }, + { + "epoch": 0.08094, + "grad_norm": 0.8170859896883738, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 8094 + }, + { + "epoch": 0.08095, + "grad_norm": 0.630198435688837, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 8095 + }, + { + "epoch": 0.08096, + "grad_norm": 0.561670164851885, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 8096 + }, + { + "epoch": 0.08097, + "grad_norm": 0.6003643063916327, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 8097 + }, + { + "epoch": 0.08098, + "grad_norm": 0.5928995388219754, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 8098 + }, + { + "epoch": 0.08099, + "grad_norm": 0.613396859162041, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 8099 + }, + { + "epoch": 0.081, + "grad_norm": 0.5937607661582928, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 8100 + }, + { + "epoch": 0.08101, + "grad_norm": 0.5633649505477989, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 8101 + }, + { + "epoch": 0.08102, + "grad_norm": 0.517609472371696, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 8102 + }, + { + "epoch": 0.08103, + "grad_norm": 0.5099084616246116, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 8103 + }, + { + "epoch": 0.08104, + "grad_norm": 0.5423305382438013, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 8104 + }, + { + "epoch": 0.08105, + "grad_norm": 0.5677547112279588, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 8105 + }, + { + "epoch": 0.08106, + "grad_norm": 0.6327286014925814, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 8106 + }, + { + "epoch": 0.08107, + "grad_norm": 0.5935405615580722, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 8107 + }, + { + "epoch": 0.08108, + "grad_norm": 0.5906936959536371, + "learning_rate": 0.003, + "loss": 4.041, + "step": 8108 + }, + { + "epoch": 0.08109, + "grad_norm": 0.7713130902488691, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 8109 + }, + { + "epoch": 0.0811, + "grad_norm": 1.089819423816047, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 8110 + }, + { + "epoch": 0.08111, + "grad_norm": 0.9827588739551303, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 8111 + }, + { + "epoch": 0.08112, + "grad_norm": 1.0089600330173312, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 8112 + }, + { + "epoch": 0.08113, + "grad_norm": 0.9171567562711044, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 8113 + }, + { + "epoch": 0.08114, + "grad_norm": 0.8631573233521649, + "learning_rate": 0.003, + "loss": 4.127, + "step": 8114 + }, + { + "epoch": 0.08115, + "grad_norm": 0.8166838272437511, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 8115 + }, + { + "epoch": 0.08116, + "grad_norm": 0.788874725717097, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 8116 + }, + { + "epoch": 0.08117, + "grad_norm": 0.7767336993314875, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 8117 + }, + { + "epoch": 0.08118, + "grad_norm": 0.9294129096540645, + "learning_rate": 0.003, + "loss": 4.1132, + "step": 8118 + }, + { + "epoch": 0.08119, + "grad_norm": 0.8162169856495585, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 8119 + }, + { + "epoch": 0.0812, + "grad_norm": 0.8009999394525374, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 8120 + }, + { + "epoch": 0.08121, + "grad_norm": 0.8865514748225773, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 8121 + }, + { + "epoch": 0.08122, + "grad_norm": 0.9884968101525333, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 8122 + }, + { + "epoch": 0.08123, + "grad_norm": 1.0369283181463518, + "learning_rate": 0.003, + "loss": 4.1164, + "step": 8123 + }, + { + "epoch": 0.08124, + "grad_norm": 0.7962033997430238, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 8124 + }, + { + "epoch": 0.08125, + "grad_norm": 0.6823614488148949, + "learning_rate": 0.003, + "loss": 4.1377, + "step": 8125 + }, + { + "epoch": 0.08126, + "grad_norm": 0.7327655636296029, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 8126 + }, + { + "epoch": 0.08127, + "grad_norm": 0.7378713020854091, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 8127 + }, + { + "epoch": 0.08128, + "grad_norm": 0.7764634911842385, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 8128 + }, + { + "epoch": 0.08129, + "grad_norm": 0.767500774593417, + "learning_rate": 0.003, + "loss": 4.091, + "step": 8129 + }, + { + "epoch": 0.0813, + "grad_norm": 0.7754642716859383, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 8130 + }, + { + "epoch": 0.08131, + "grad_norm": 0.7938632639450202, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 8131 + }, + { + "epoch": 0.08132, + "grad_norm": 0.7890658387507653, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 8132 + }, + { + "epoch": 0.08133, + "grad_norm": 0.7935294120537186, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 8133 + }, + { + "epoch": 0.08134, + "grad_norm": 0.8391989214547845, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 8134 + }, + { + "epoch": 0.08135, + "grad_norm": 0.9554035900858945, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 8135 + }, + { + "epoch": 0.08136, + "grad_norm": 0.9663343129111622, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 8136 + }, + { + "epoch": 0.08137, + "grad_norm": 0.8734638206822923, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 8137 + }, + { + "epoch": 0.08138, + "grad_norm": 0.9659073383764333, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 8138 + }, + { + "epoch": 0.08139, + "grad_norm": 1.0174128187111928, + "learning_rate": 0.003, + "loss": 4.1259, + "step": 8139 + }, + { + "epoch": 0.0814, + "grad_norm": 0.8902331333269284, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 8140 + }, + { + "epoch": 0.08141, + "grad_norm": 0.8349574760386372, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 8141 + }, + { + "epoch": 0.08142, + "grad_norm": 0.7575245098644054, + "learning_rate": 0.003, + "loss": 4.1379, + "step": 8142 + }, + { + "epoch": 0.08143, + "grad_norm": 0.8507077034723728, + "learning_rate": 0.003, + "loss": 4.1328, + "step": 8143 + }, + { + "epoch": 0.08144, + "grad_norm": 0.9743221963844606, + "learning_rate": 0.003, + "loss": 4.107, + "step": 8144 + }, + { + "epoch": 0.08145, + "grad_norm": 0.9476119009807066, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 8145 + }, + { + "epoch": 0.08146, + "grad_norm": 0.9002626878014724, + "learning_rate": 0.003, + "loss": 4.1448, + "step": 8146 + }, + { + "epoch": 0.08147, + "grad_norm": 0.8588231090576398, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 8147 + }, + { + "epoch": 0.08148, + "grad_norm": 0.7514699828415461, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 8148 + }, + { + "epoch": 0.08149, + "grad_norm": 0.6673420124266218, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 8149 + }, + { + "epoch": 0.0815, + "grad_norm": 0.7038005384460582, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 8150 + }, + { + "epoch": 0.08151, + "grad_norm": 0.7264508313079431, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 8151 + }, + { + "epoch": 0.08152, + "grad_norm": 0.8385234903173766, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 8152 + }, + { + "epoch": 0.08153, + "grad_norm": 0.8772929826726037, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 8153 + }, + { + "epoch": 0.08154, + "grad_norm": 0.773933052101485, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 8154 + }, + { + "epoch": 0.08155, + "grad_norm": 0.5418340291312099, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 8155 + }, + { + "epoch": 0.08156, + "grad_norm": 0.612247602405888, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 8156 + }, + { + "epoch": 0.08157, + "grad_norm": 0.6995306526237685, + "learning_rate": 0.003, + "loss": 4.054, + "step": 8157 + }, + { + "epoch": 0.08158, + "grad_norm": 0.6914288085424243, + "learning_rate": 0.003, + "loss": 4.1327, + "step": 8158 + }, + { + "epoch": 0.08159, + "grad_norm": 0.5639754454368973, + "learning_rate": 0.003, + "loss": 4.126, + "step": 8159 + }, + { + "epoch": 0.0816, + "grad_norm": 0.5308643800912878, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 8160 + }, + { + "epoch": 0.08161, + "grad_norm": 0.4794329031430582, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 8161 + }, + { + "epoch": 0.08162, + "grad_norm": 0.3941630661211312, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 8162 + }, + { + "epoch": 0.08163, + "grad_norm": 0.3500016460344925, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 8163 + }, + { + "epoch": 0.08164, + "grad_norm": 0.36562869444919177, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 8164 + }, + { + "epoch": 0.08165, + "grad_norm": 0.43629694271249264, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 8165 + }, + { + "epoch": 0.08166, + "grad_norm": 0.5708500396948257, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 8166 + }, + { + "epoch": 0.08167, + "grad_norm": 0.7973454394619072, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 8167 + }, + { + "epoch": 0.08168, + "grad_norm": 1.0166589750002435, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 8168 + }, + { + "epoch": 0.08169, + "grad_norm": 1.0549264180537317, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 8169 + }, + { + "epoch": 0.0817, + "grad_norm": 0.71968050278163, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 8170 + }, + { + "epoch": 0.08171, + "grad_norm": 0.6543779608988335, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 8171 + }, + { + "epoch": 0.08172, + "grad_norm": 0.6293237774913742, + "learning_rate": 0.003, + "loss": 4.1053, + "step": 8172 + }, + { + "epoch": 0.08173, + "grad_norm": 0.6399157682768067, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 8173 + }, + { + "epoch": 0.08174, + "grad_norm": 0.6938917181866926, + "learning_rate": 0.003, + "loss": 4.1244, + "step": 8174 + }, + { + "epoch": 0.08175, + "grad_norm": 0.6568284294958178, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 8175 + }, + { + "epoch": 0.08176, + "grad_norm": 0.6514625676960912, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 8176 + }, + { + "epoch": 0.08177, + "grad_norm": 0.6436411792736433, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 8177 + }, + { + "epoch": 0.08178, + "grad_norm": 0.6566610790643876, + "learning_rate": 0.003, + "loss": 4.088, + "step": 8178 + }, + { + "epoch": 0.08179, + "grad_norm": 0.6775002449384784, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 8179 + }, + { + "epoch": 0.0818, + "grad_norm": 0.6047058885443021, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 8180 + }, + { + "epoch": 0.08181, + "grad_norm": 0.49255328216218053, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 8181 + }, + { + "epoch": 0.08182, + "grad_norm": 0.6513781988399094, + "learning_rate": 0.003, + "loss": 4.12, + "step": 8182 + }, + { + "epoch": 0.08183, + "grad_norm": 0.756600046387577, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 8183 + }, + { + "epoch": 0.08184, + "grad_norm": 0.8214276567397872, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 8184 + }, + { + "epoch": 0.08185, + "grad_norm": 0.8752383489148959, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 8185 + }, + { + "epoch": 0.08186, + "grad_norm": 0.8441519707828739, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 8186 + }, + { + "epoch": 0.08187, + "grad_norm": 0.7672475601697205, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 8187 + }, + { + "epoch": 0.08188, + "grad_norm": 0.7282380600276215, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 8188 + }, + { + "epoch": 0.08189, + "grad_norm": 0.6846705238578715, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 8189 + }, + { + "epoch": 0.0819, + "grad_norm": 0.7207597619767919, + "learning_rate": 0.003, + "loss": 4.092, + "step": 8190 + }, + { + "epoch": 0.08191, + "grad_norm": 0.7101247154479835, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 8191 + }, + { + "epoch": 0.08192, + "grad_norm": 0.7686485468774331, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 8192 + }, + { + "epoch": 0.08193, + "grad_norm": 0.7889428905614293, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 8193 + }, + { + "epoch": 0.08194, + "grad_norm": 0.9153699308382264, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 8194 + }, + { + "epoch": 0.08195, + "grad_norm": 0.8233344904776073, + "learning_rate": 0.003, + "loss": 4.091, + "step": 8195 + }, + { + "epoch": 0.08196, + "grad_norm": 0.6057143745665352, + "learning_rate": 0.003, + "loss": 4.1161, + "step": 8196 + }, + { + "epoch": 0.08197, + "grad_norm": 0.5644875985686773, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 8197 + }, + { + "epoch": 0.08198, + "grad_norm": 0.5894267537410771, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 8198 + }, + { + "epoch": 0.08199, + "grad_norm": 0.692674390552851, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 8199 + }, + { + "epoch": 0.082, + "grad_norm": 0.7242086618626092, + "learning_rate": 0.003, + "loss": 4.074, + "step": 8200 + }, + { + "epoch": 0.08201, + "grad_norm": 0.7495656742172861, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 8201 + }, + { + "epoch": 0.08202, + "grad_norm": 0.5902754450349533, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 8202 + }, + { + "epoch": 0.08203, + "grad_norm": 0.4939528228400696, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 8203 + }, + { + "epoch": 0.08204, + "grad_norm": 0.5819717473152258, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 8204 + }, + { + "epoch": 0.08205, + "grad_norm": 0.6610578196934844, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 8205 + }, + { + "epoch": 0.08206, + "grad_norm": 0.6833022930778619, + "learning_rate": 0.003, + "loss": 4.12, + "step": 8206 + }, + { + "epoch": 0.08207, + "grad_norm": 0.7109226906588522, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 8207 + }, + { + "epoch": 0.08208, + "grad_norm": 0.6863440404860754, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 8208 + }, + { + "epoch": 0.08209, + "grad_norm": 0.7395650743809461, + "learning_rate": 0.003, + "loss": 4.1132, + "step": 8209 + }, + { + "epoch": 0.0821, + "grad_norm": 0.9883215726717448, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 8210 + }, + { + "epoch": 0.08211, + "grad_norm": 1.2385395849984975, + "learning_rate": 0.003, + "loss": 4.1211, + "step": 8211 + }, + { + "epoch": 0.08212, + "grad_norm": 0.8918876274434122, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 8212 + }, + { + "epoch": 0.08213, + "grad_norm": 0.8581489019919268, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 8213 + }, + { + "epoch": 0.08214, + "grad_norm": 0.8197346780442005, + "learning_rate": 0.003, + "loss": 4.1313, + "step": 8214 + }, + { + "epoch": 0.08215, + "grad_norm": 0.762507621386095, + "learning_rate": 0.003, + "loss": 4.1312, + "step": 8215 + }, + { + "epoch": 0.08216, + "grad_norm": 0.7489221002140064, + "learning_rate": 0.003, + "loss": 4.105, + "step": 8216 + }, + { + "epoch": 0.08217, + "grad_norm": 0.9152826477695661, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 8217 + }, + { + "epoch": 0.08218, + "grad_norm": 1.0382652565254082, + "learning_rate": 0.003, + "loss": 4.1102, + "step": 8218 + }, + { + "epoch": 0.08219, + "grad_norm": 1.026804807644373, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 8219 + }, + { + "epoch": 0.0822, + "grad_norm": 1.0424560881490594, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 8220 + }, + { + "epoch": 0.08221, + "grad_norm": 0.8714512313495743, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 8221 + }, + { + "epoch": 0.08222, + "grad_norm": 0.7651631883270994, + "learning_rate": 0.003, + "loss": 4.1153, + "step": 8222 + }, + { + "epoch": 0.08223, + "grad_norm": 0.729346414097472, + "learning_rate": 0.003, + "loss": 4.125, + "step": 8223 + }, + { + "epoch": 0.08224, + "grad_norm": 0.7979340889752132, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 8224 + }, + { + "epoch": 0.08225, + "grad_norm": 0.7645944759378901, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 8225 + }, + { + "epoch": 0.08226, + "grad_norm": 0.5448138887864806, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 8226 + }, + { + "epoch": 0.08227, + "grad_norm": 0.5856081470610704, + "learning_rate": 0.003, + "loss": 4.1312, + "step": 8227 + }, + { + "epoch": 0.08228, + "grad_norm": 0.6128538428938678, + "learning_rate": 0.003, + "loss": 4.1173, + "step": 8228 + }, + { + "epoch": 0.08229, + "grad_norm": 0.5828543069405947, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 8229 + }, + { + "epoch": 0.0823, + "grad_norm": 0.5252226962559838, + "learning_rate": 0.003, + "loss": 4.1198, + "step": 8230 + }, + { + "epoch": 0.08231, + "grad_norm": 0.5557346682459137, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 8231 + }, + { + "epoch": 0.08232, + "grad_norm": 0.5650481326541311, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 8232 + }, + { + "epoch": 0.08233, + "grad_norm": 0.5634969006102941, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 8233 + }, + { + "epoch": 0.08234, + "grad_norm": 0.5450247074135766, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 8234 + }, + { + "epoch": 0.08235, + "grad_norm": 0.499938817117624, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 8235 + }, + { + "epoch": 0.08236, + "grad_norm": 0.5296381965012722, + "learning_rate": 0.003, + "loss": 4.076, + "step": 8236 + }, + { + "epoch": 0.08237, + "grad_norm": 0.547808654408303, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 8237 + }, + { + "epoch": 0.08238, + "grad_norm": 0.5329237503159692, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 8238 + }, + { + "epoch": 0.08239, + "grad_norm": 0.6012253849536762, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 8239 + }, + { + "epoch": 0.0824, + "grad_norm": 0.7137927518105237, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 8240 + }, + { + "epoch": 0.08241, + "grad_norm": 0.9032854526366563, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 8241 + }, + { + "epoch": 0.08242, + "grad_norm": 1.2043691943864299, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 8242 + }, + { + "epoch": 0.08243, + "grad_norm": 0.8643493496322304, + "learning_rate": 0.003, + "loss": 4.1204, + "step": 8243 + }, + { + "epoch": 0.08244, + "grad_norm": 0.8144027974165489, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 8244 + }, + { + "epoch": 0.08245, + "grad_norm": 0.7635515232637834, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 8245 + }, + { + "epoch": 0.08246, + "grad_norm": 0.7163154200389772, + "learning_rate": 0.003, + "loss": 4.095, + "step": 8246 + }, + { + "epoch": 0.08247, + "grad_norm": 0.748090338264524, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 8247 + }, + { + "epoch": 0.08248, + "grad_norm": 0.5601792675667456, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 8248 + }, + { + "epoch": 0.08249, + "grad_norm": 0.5660826911158605, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 8249 + }, + { + "epoch": 0.0825, + "grad_norm": 0.49777269961971965, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 8250 + }, + { + "epoch": 0.08251, + "grad_norm": 0.4749763269578897, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 8251 + }, + { + "epoch": 0.08252, + "grad_norm": 0.5367890821268732, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 8252 + }, + { + "epoch": 0.08253, + "grad_norm": 0.6209172018514464, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 8253 + }, + { + "epoch": 0.08254, + "grad_norm": 0.7460752011518125, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 8254 + }, + { + "epoch": 0.08255, + "grad_norm": 0.8859483471117526, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 8255 + }, + { + "epoch": 0.08256, + "grad_norm": 0.984289350625384, + "learning_rate": 0.003, + "loss": 4.122, + "step": 8256 + }, + { + "epoch": 0.08257, + "grad_norm": 1.0835433642153152, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 8257 + }, + { + "epoch": 0.08258, + "grad_norm": 0.8632204878935943, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 8258 + }, + { + "epoch": 0.08259, + "grad_norm": 0.7275448679612547, + "learning_rate": 0.003, + "loss": 4.1329, + "step": 8259 + }, + { + "epoch": 0.0826, + "grad_norm": 0.6773298349818122, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 8260 + }, + { + "epoch": 0.08261, + "grad_norm": 0.7281743646317274, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 8261 + }, + { + "epoch": 0.08262, + "grad_norm": 0.9373357855734575, + "learning_rate": 0.003, + "loss": 4.1382, + "step": 8262 + }, + { + "epoch": 0.08263, + "grad_norm": 0.9851510358265243, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 8263 + }, + { + "epoch": 0.08264, + "grad_norm": 0.9213918489568741, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 8264 + }, + { + "epoch": 0.08265, + "grad_norm": 0.8995466212053259, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 8265 + }, + { + "epoch": 0.08266, + "grad_norm": 0.8095431576329132, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 8266 + }, + { + "epoch": 0.08267, + "grad_norm": 0.8603884649749075, + "learning_rate": 0.003, + "loss": 4.1115, + "step": 8267 + }, + { + "epoch": 0.08268, + "grad_norm": 0.8697551596264255, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 8268 + }, + { + "epoch": 0.08269, + "grad_norm": 0.8766282707684185, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 8269 + }, + { + "epoch": 0.0827, + "grad_norm": 0.7471962671773691, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 8270 + }, + { + "epoch": 0.08271, + "grad_norm": 0.6435084919846199, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 8271 + }, + { + "epoch": 0.08272, + "grad_norm": 0.5510784750295802, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 8272 + }, + { + "epoch": 0.08273, + "grad_norm": 0.5988908318224689, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 8273 + }, + { + "epoch": 0.08274, + "grad_norm": 0.6072551102960675, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 8274 + }, + { + "epoch": 0.08275, + "grad_norm": 0.6062453075526276, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 8275 + }, + { + "epoch": 0.08276, + "grad_norm": 0.5278924020145801, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 8276 + }, + { + "epoch": 0.08277, + "grad_norm": 0.5216351289934781, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 8277 + }, + { + "epoch": 0.08278, + "grad_norm": 0.5235445369007313, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 8278 + }, + { + "epoch": 0.08279, + "grad_norm": 0.4302544923490237, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 8279 + }, + { + "epoch": 0.0828, + "grad_norm": 0.45314513906146026, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 8280 + }, + { + "epoch": 0.08281, + "grad_norm": 0.4947819664319072, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 8281 + }, + { + "epoch": 0.08282, + "grad_norm": 0.4700115344785109, + "learning_rate": 0.003, + "loss": 4.075, + "step": 8282 + }, + { + "epoch": 0.08283, + "grad_norm": 0.47458184523582214, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 8283 + }, + { + "epoch": 0.08284, + "grad_norm": 0.6217194123770353, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 8284 + }, + { + "epoch": 0.08285, + "grad_norm": 0.7679318114887002, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 8285 + }, + { + "epoch": 0.08286, + "grad_norm": 1.0091565333262968, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 8286 + }, + { + "epoch": 0.08287, + "grad_norm": 1.0308871533238988, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 8287 + }, + { + "epoch": 0.08288, + "grad_norm": 0.7715553741553134, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 8288 + }, + { + "epoch": 0.08289, + "grad_norm": 0.7246211823017321, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 8289 + }, + { + "epoch": 0.0829, + "grad_norm": 0.6693388476539852, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 8290 + }, + { + "epoch": 0.08291, + "grad_norm": 0.6792730649441251, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 8291 + }, + { + "epoch": 0.08292, + "grad_norm": 0.715764254394465, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 8292 + }, + { + "epoch": 0.08293, + "grad_norm": 0.8330895077359063, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 8293 + }, + { + "epoch": 0.08294, + "grad_norm": 0.8999946195152977, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 8294 + }, + { + "epoch": 0.08295, + "grad_norm": 0.9722144185539187, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 8295 + }, + { + "epoch": 0.08296, + "grad_norm": 0.8945126241906354, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 8296 + }, + { + "epoch": 0.08297, + "grad_norm": 0.931976526589576, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 8297 + }, + { + "epoch": 0.08298, + "grad_norm": 1.0550600149143023, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 8298 + }, + { + "epoch": 0.08299, + "grad_norm": 1.0038590631217534, + "learning_rate": 0.003, + "loss": 4.1169, + "step": 8299 + }, + { + "epoch": 0.083, + "grad_norm": 1.0483622801143657, + "learning_rate": 0.003, + "loss": 4.1615, + "step": 8300 + }, + { + "epoch": 0.08301, + "grad_norm": 0.8663552032951759, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 8301 + }, + { + "epoch": 0.08302, + "grad_norm": 0.9615360827126928, + "learning_rate": 0.003, + "loss": 4.1514, + "step": 8302 + }, + { + "epoch": 0.08303, + "grad_norm": 0.8375481767872405, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 8303 + }, + { + "epoch": 0.08304, + "grad_norm": 0.7453597337442939, + "learning_rate": 0.003, + "loss": 4.1436, + "step": 8304 + }, + { + "epoch": 0.08305, + "grad_norm": 0.7131331051227506, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 8305 + }, + { + "epoch": 0.08306, + "grad_norm": 0.7124866649665565, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 8306 + }, + { + "epoch": 0.08307, + "grad_norm": 0.8250868166173586, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 8307 + }, + { + "epoch": 0.08308, + "grad_norm": 0.9061398647327097, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 8308 + }, + { + "epoch": 0.08309, + "grad_norm": 1.0307311740161893, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 8309 + }, + { + "epoch": 0.0831, + "grad_norm": 0.992744547897445, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 8310 + }, + { + "epoch": 0.08311, + "grad_norm": 0.7488997564809391, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 8311 + }, + { + "epoch": 0.08312, + "grad_norm": 0.7053261274926852, + "learning_rate": 0.003, + "loss": 4.1363, + "step": 8312 + }, + { + "epoch": 0.08313, + "grad_norm": 0.6593948632063114, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 8313 + }, + { + "epoch": 0.08314, + "grad_norm": 0.6995852846034631, + "learning_rate": 0.003, + "loss": 4.1251, + "step": 8314 + }, + { + "epoch": 0.08315, + "grad_norm": 0.7002924867496525, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 8315 + }, + { + "epoch": 0.08316, + "grad_norm": 0.754368216614145, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 8316 + }, + { + "epoch": 0.08317, + "grad_norm": 0.7750729970073041, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 8317 + }, + { + "epoch": 0.08318, + "grad_norm": 0.6364553809945898, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 8318 + }, + { + "epoch": 0.08319, + "grad_norm": 0.5773708268407741, + "learning_rate": 0.003, + "loss": 4.081, + "step": 8319 + }, + { + "epoch": 0.0832, + "grad_norm": 0.6001536418360034, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 8320 + }, + { + "epoch": 0.08321, + "grad_norm": 0.5089768075103857, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 8321 + }, + { + "epoch": 0.08322, + "grad_norm": 0.518428766042433, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 8322 + }, + { + "epoch": 0.08323, + "grad_norm": 0.5173998718635492, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 8323 + }, + { + "epoch": 0.08324, + "grad_norm": 0.4750014989168887, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 8324 + }, + { + "epoch": 0.08325, + "grad_norm": 0.5855181107353905, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 8325 + }, + { + "epoch": 0.08326, + "grad_norm": 0.7628006782684289, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 8326 + }, + { + "epoch": 0.08327, + "grad_norm": 1.0618146389847019, + "learning_rate": 0.003, + "loss": 4.091, + "step": 8327 + }, + { + "epoch": 0.08328, + "grad_norm": 0.9388504158158667, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 8328 + }, + { + "epoch": 0.08329, + "grad_norm": 0.6800766818090532, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 8329 + }, + { + "epoch": 0.0833, + "grad_norm": 0.5773008995872342, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 8330 + }, + { + "epoch": 0.08331, + "grad_norm": 0.5748792212675666, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 8331 + }, + { + "epoch": 0.08332, + "grad_norm": 0.5541285891852565, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 8332 + }, + { + "epoch": 0.08333, + "grad_norm": 0.5416560426836903, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 8333 + }, + { + "epoch": 0.08334, + "grad_norm": 0.5432886614396639, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 8334 + }, + { + "epoch": 0.08335, + "grad_norm": 0.6393368606314609, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 8335 + }, + { + "epoch": 0.08336, + "grad_norm": 0.670088341618419, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 8336 + }, + { + "epoch": 0.08337, + "grad_norm": 0.759669728240697, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 8337 + }, + { + "epoch": 0.08338, + "grad_norm": 0.7670617917071586, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 8338 + }, + { + "epoch": 0.08339, + "grad_norm": 0.7599957118398699, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 8339 + }, + { + "epoch": 0.0834, + "grad_norm": 0.8032482037405414, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 8340 + }, + { + "epoch": 0.08341, + "grad_norm": 0.7742413499026626, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 8341 + }, + { + "epoch": 0.08342, + "grad_norm": 0.7647103140349684, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 8342 + }, + { + "epoch": 0.08343, + "grad_norm": 0.6857455040634841, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 8343 + }, + { + "epoch": 0.08344, + "grad_norm": 0.6285429482356256, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 8344 + }, + { + "epoch": 0.08345, + "grad_norm": 0.7221980546699931, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 8345 + }, + { + "epoch": 0.08346, + "grad_norm": 0.7143054001868141, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 8346 + }, + { + "epoch": 0.08347, + "grad_norm": 0.6385179914801554, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 8347 + }, + { + "epoch": 0.08348, + "grad_norm": 0.6990845577714458, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 8348 + }, + { + "epoch": 0.08349, + "grad_norm": 0.6913429696640464, + "learning_rate": 0.003, + "loss": 4.1008, + "step": 8349 + }, + { + "epoch": 0.0835, + "grad_norm": 0.7192577771703562, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 8350 + }, + { + "epoch": 0.08351, + "grad_norm": 0.7760395598930231, + "learning_rate": 0.003, + "loss": 4.116, + "step": 8351 + }, + { + "epoch": 0.08352, + "grad_norm": 0.7993190167350477, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 8352 + }, + { + "epoch": 0.08353, + "grad_norm": 0.7398688625286849, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 8353 + }, + { + "epoch": 0.08354, + "grad_norm": 0.6834075220306864, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 8354 + }, + { + "epoch": 0.08355, + "grad_norm": 0.8175214131818854, + "learning_rate": 0.003, + "loss": 4.08, + "step": 8355 + }, + { + "epoch": 0.08356, + "grad_norm": 0.9414888867819124, + "learning_rate": 0.003, + "loss": 4.112, + "step": 8356 + }, + { + "epoch": 0.08357, + "grad_norm": 0.960843920438772, + "learning_rate": 0.003, + "loss": 4.1211, + "step": 8357 + }, + { + "epoch": 0.08358, + "grad_norm": 0.9609403468914611, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 8358 + }, + { + "epoch": 0.08359, + "grad_norm": 0.9295346213960959, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 8359 + }, + { + "epoch": 0.0836, + "grad_norm": 0.9085274043267699, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 8360 + }, + { + "epoch": 0.08361, + "grad_norm": 0.8071363639869016, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 8361 + }, + { + "epoch": 0.08362, + "grad_norm": 0.9382775907297787, + "learning_rate": 0.003, + "loss": 4.1279, + "step": 8362 + }, + { + "epoch": 0.08363, + "grad_norm": 0.8709824514663582, + "learning_rate": 0.003, + "loss": 4.1389, + "step": 8363 + }, + { + "epoch": 0.08364, + "grad_norm": 0.675740458123866, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 8364 + }, + { + "epoch": 0.08365, + "grad_norm": 0.6496018714951994, + "learning_rate": 0.003, + "loss": 4.112, + "step": 8365 + }, + { + "epoch": 0.08366, + "grad_norm": 0.6610790017908971, + "learning_rate": 0.003, + "loss": 4.1223, + "step": 8366 + }, + { + "epoch": 0.08367, + "grad_norm": 0.6766179312441892, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 8367 + }, + { + "epoch": 0.08368, + "grad_norm": 0.6048963944008922, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 8368 + }, + { + "epoch": 0.08369, + "grad_norm": 0.5577980801642201, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 8369 + }, + { + "epoch": 0.0837, + "grad_norm": 0.5452065922923452, + "learning_rate": 0.003, + "loss": 4.093, + "step": 8370 + }, + { + "epoch": 0.08371, + "grad_norm": 0.5328227419403814, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 8371 + }, + { + "epoch": 0.08372, + "grad_norm": 0.5572436569854103, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 8372 + }, + { + "epoch": 0.08373, + "grad_norm": 0.595024680031319, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 8373 + }, + { + "epoch": 0.08374, + "grad_norm": 0.6171013061042033, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 8374 + }, + { + "epoch": 0.08375, + "grad_norm": 0.6632173727246423, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 8375 + }, + { + "epoch": 0.08376, + "grad_norm": 0.7586043969017534, + "learning_rate": 0.003, + "loss": 4.084, + "step": 8376 + }, + { + "epoch": 0.08377, + "grad_norm": 0.8201258736560947, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 8377 + }, + { + "epoch": 0.08378, + "grad_norm": 0.7660059240272924, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 8378 + }, + { + "epoch": 0.08379, + "grad_norm": 0.6820454577432933, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 8379 + }, + { + "epoch": 0.0838, + "grad_norm": 0.6391725474755902, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 8380 + }, + { + "epoch": 0.08381, + "grad_norm": 0.626073359728838, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 8381 + }, + { + "epoch": 0.08382, + "grad_norm": 0.6537353438521994, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 8382 + }, + { + "epoch": 0.08383, + "grad_norm": 0.5286621728800187, + "learning_rate": 0.003, + "loss": 4.066, + "step": 8383 + }, + { + "epoch": 0.08384, + "grad_norm": 0.49570196885919826, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 8384 + }, + { + "epoch": 0.08385, + "grad_norm": 0.4720817397929105, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 8385 + }, + { + "epoch": 0.08386, + "grad_norm": 0.4116758591839013, + "learning_rate": 0.003, + "loss": 4.075, + "step": 8386 + }, + { + "epoch": 0.08387, + "grad_norm": 0.45306876446030137, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 8387 + }, + { + "epoch": 0.08388, + "grad_norm": 0.6179847808239425, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 8388 + }, + { + "epoch": 0.08389, + "grad_norm": 0.9294947853793362, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 8389 + }, + { + "epoch": 0.0839, + "grad_norm": 1.2031306786524287, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 8390 + }, + { + "epoch": 0.08391, + "grad_norm": 0.8046262538632072, + "learning_rate": 0.003, + "loss": 4.109, + "step": 8391 + }, + { + "epoch": 0.08392, + "grad_norm": 0.882493419185855, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 8392 + }, + { + "epoch": 0.08393, + "grad_norm": 0.8613668633518297, + "learning_rate": 0.003, + "loss": 4.1164, + "step": 8393 + }, + { + "epoch": 0.08394, + "grad_norm": 0.8844121298915572, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 8394 + }, + { + "epoch": 0.08395, + "grad_norm": 0.9156972707334563, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 8395 + }, + { + "epoch": 0.08396, + "grad_norm": 0.8431255266855052, + "learning_rate": 0.003, + "loss": 4.1244, + "step": 8396 + }, + { + "epoch": 0.08397, + "grad_norm": 0.8830670687070312, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 8397 + }, + { + "epoch": 0.08398, + "grad_norm": 0.9314448769669744, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 8398 + }, + { + "epoch": 0.08399, + "grad_norm": 1.0449422905278423, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 8399 + }, + { + "epoch": 0.084, + "grad_norm": 1.0703912427847535, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 8400 + }, + { + "epoch": 0.08401, + "grad_norm": 0.99544544738125, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 8401 + }, + { + "epoch": 0.08402, + "grad_norm": 0.9137811882497271, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 8402 + }, + { + "epoch": 0.08403, + "grad_norm": 0.8345353163377343, + "learning_rate": 0.003, + "loss": 4.1055, + "step": 8403 + }, + { + "epoch": 0.08404, + "grad_norm": 0.7423619551465458, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 8404 + }, + { + "epoch": 0.08405, + "grad_norm": 0.6917338055216923, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 8405 + }, + { + "epoch": 0.08406, + "grad_norm": 0.6541321327571565, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 8406 + }, + { + "epoch": 0.08407, + "grad_norm": 0.6748689162799533, + "learning_rate": 0.003, + "loss": 4.153, + "step": 8407 + }, + { + "epoch": 0.08408, + "grad_norm": 0.6636547398677606, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 8408 + }, + { + "epoch": 0.08409, + "grad_norm": 0.6820702144071761, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 8409 + }, + { + "epoch": 0.0841, + "grad_norm": 0.6703929070086052, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 8410 + }, + { + "epoch": 0.08411, + "grad_norm": 0.6537163754004126, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 8411 + }, + { + "epoch": 0.08412, + "grad_norm": 0.6180851692722509, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 8412 + }, + { + "epoch": 0.08413, + "grad_norm": 0.5613447966463775, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 8413 + }, + { + "epoch": 0.08414, + "grad_norm": 0.5426935541427719, + "learning_rate": 0.003, + "loss": 4.109, + "step": 8414 + }, + { + "epoch": 0.08415, + "grad_norm": 0.5455362132623575, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 8415 + }, + { + "epoch": 0.08416, + "grad_norm": 0.5671062554035091, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 8416 + }, + { + "epoch": 0.08417, + "grad_norm": 0.6929581320753861, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 8417 + }, + { + "epoch": 0.08418, + "grad_norm": 0.7632658007580719, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 8418 + }, + { + "epoch": 0.08419, + "grad_norm": 0.8207298972699563, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 8419 + }, + { + "epoch": 0.0842, + "grad_norm": 0.8838166900993079, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 8420 + }, + { + "epoch": 0.08421, + "grad_norm": 0.8494239057110717, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 8421 + }, + { + "epoch": 0.08422, + "grad_norm": 0.7486051228897643, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 8422 + }, + { + "epoch": 0.08423, + "grad_norm": 0.6681036680494669, + "learning_rate": 0.003, + "loss": 4.092, + "step": 8423 + }, + { + "epoch": 0.08424, + "grad_norm": 0.6231615422695527, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 8424 + }, + { + "epoch": 0.08425, + "grad_norm": 0.5973371615272063, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 8425 + }, + { + "epoch": 0.08426, + "grad_norm": 0.590081914250373, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 8426 + }, + { + "epoch": 0.08427, + "grad_norm": 0.7124778189115221, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 8427 + }, + { + "epoch": 0.08428, + "grad_norm": 0.7887937264015177, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 8428 + }, + { + "epoch": 0.08429, + "grad_norm": 0.7678709133144459, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 8429 + }, + { + "epoch": 0.0843, + "grad_norm": 0.6332332135809398, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 8430 + }, + { + "epoch": 0.08431, + "grad_norm": 0.5107659296097526, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 8431 + }, + { + "epoch": 0.08432, + "grad_norm": 0.48858809488607446, + "learning_rate": 0.003, + "loss": 4.057, + "step": 8432 + }, + { + "epoch": 0.08433, + "grad_norm": 0.5110654376932778, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 8433 + }, + { + "epoch": 0.08434, + "grad_norm": 0.534360910387737, + "learning_rate": 0.003, + "loss": 4.048, + "step": 8434 + }, + { + "epoch": 0.08435, + "grad_norm": 0.5820631960297242, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 8435 + }, + { + "epoch": 0.08436, + "grad_norm": 0.661389118471186, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 8436 + }, + { + "epoch": 0.08437, + "grad_norm": 0.7111949328987919, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 8437 + }, + { + "epoch": 0.08438, + "grad_norm": 0.7522347549752256, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 8438 + }, + { + "epoch": 0.08439, + "grad_norm": 0.9493832565098858, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 8439 + }, + { + "epoch": 0.0844, + "grad_norm": 1.0648764993147652, + "learning_rate": 0.003, + "loss": 4.1258, + "step": 8440 + }, + { + "epoch": 0.08441, + "grad_norm": 0.8414607606679769, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 8441 + }, + { + "epoch": 0.08442, + "grad_norm": 0.7257158697574967, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 8442 + }, + { + "epoch": 0.08443, + "grad_norm": 0.6399687963227431, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 8443 + }, + { + "epoch": 0.08444, + "grad_norm": 0.6122601189245481, + "learning_rate": 0.003, + "loss": 4.126, + "step": 8444 + }, + { + "epoch": 0.08445, + "grad_norm": 0.6421561307708404, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 8445 + }, + { + "epoch": 0.08446, + "grad_norm": 0.7720968877758178, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 8446 + }, + { + "epoch": 0.08447, + "grad_norm": 0.9969368300751122, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 8447 + }, + { + "epoch": 0.08448, + "grad_norm": 1.040230047556125, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 8448 + }, + { + "epoch": 0.08449, + "grad_norm": 0.9375823755529817, + "learning_rate": 0.003, + "loss": 4.1266, + "step": 8449 + }, + { + "epoch": 0.0845, + "grad_norm": 0.8771502461318594, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 8450 + }, + { + "epoch": 0.08451, + "grad_norm": 0.7739620306437573, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 8451 + }, + { + "epoch": 0.08452, + "grad_norm": 0.673756075602562, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 8452 + }, + { + "epoch": 0.08453, + "grad_norm": 0.6543280916386777, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 8453 + }, + { + "epoch": 0.08454, + "grad_norm": 0.6889773663662369, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 8454 + }, + { + "epoch": 0.08455, + "grad_norm": 0.7838053719288101, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 8455 + }, + { + "epoch": 0.08456, + "grad_norm": 0.8119139733359273, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 8456 + }, + { + "epoch": 0.08457, + "grad_norm": 0.6837577523016988, + "learning_rate": 0.003, + "loss": 4.1321, + "step": 8457 + }, + { + "epoch": 0.08458, + "grad_norm": 0.6923132881641682, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 8458 + }, + { + "epoch": 0.08459, + "grad_norm": 0.7691218378497301, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 8459 + }, + { + "epoch": 0.0846, + "grad_norm": 0.7521614723982595, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 8460 + }, + { + "epoch": 0.08461, + "grad_norm": 0.7593018376927517, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 8461 + }, + { + "epoch": 0.08462, + "grad_norm": 0.8660621840990166, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 8462 + }, + { + "epoch": 0.08463, + "grad_norm": 0.832579407041644, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 8463 + }, + { + "epoch": 0.08464, + "grad_norm": 0.8496293669573718, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 8464 + }, + { + "epoch": 0.08465, + "grad_norm": 0.9447869224057244, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 8465 + }, + { + "epoch": 0.08466, + "grad_norm": 0.9693005746173255, + "learning_rate": 0.003, + "loss": 4.129, + "step": 8466 + }, + { + "epoch": 0.08467, + "grad_norm": 0.6866322634836225, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 8467 + }, + { + "epoch": 0.08468, + "grad_norm": 0.7903651480232442, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 8468 + }, + { + "epoch": 0.08469, + "grad_norm": 0.7645618738122506, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 8469 + }, + { + "epoch": 0.0847, + "grad_norm": 0.7413695835335838, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 8470 + }, + { + "epoch": 0.08471, + "grad_norm": 0.7231784570238742, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 8471 + }, + { + "epoch": 0.08472, + "grad_norm": 0.931451402440091, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 8472 + }, + { + "epoch": 0.08473, + "grad_norm": 1.1881300365948153, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 8473 + }, + { + "epoch": 0.08474, + "grad_norm": 0.7884945285700253, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 8474 + }, + { + "epoch": 0.08475, + "grad_norm": 0.6466505880641876, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 8475 + }, + { + "epoch": 0.08476, + "grad_norm": 0.6843402752034309, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 8476 + }, + { + "epoch": 0.08477, + "grad_norm": 0.6820809860569809, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 8477 + }, + { + "epoch": 0.08478, + "grad_norm": 0.6640365539259963, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 8478 + }, + { + "epoch": 0.08479, + "grad_norm": 0.6014643687487943, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 8479 + }, + { + "epoch": 0.0848, + "grad_norm": 0.6525358299661146, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 8480 + }, + { + "epoch": 0.08481, + "grad_norm": 0.706918999582302, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 8481 + }, + { + "epoch": 0.08482, + "grad_norm": 0.7468135222494826, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 8482 + }, + { + "epoch": 0.08483, + "grad_norm": 0.7732979758093033, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 8483 + }, + { + "epoch": 0.08484, + "grad_norm": 0.7492155564600703, + "learning_rate": 0.003, + "loss": 4.074, + "step": 8484 + }, + { + "epoch": 0.08485, + "grad_norm": 0.6786121897383414, + "learning_rate": 0.003, + "loss": 4.1, + "step": 8485 + }, + { + "epoch": 0.08486, + "grad_norm": 0.6273731355412779, + "learning_rate": 0.003, + "loss": 4.12, + "step": 8486 + }, + { + "epoch": 0.08487, + "grad_norm": 0.6720801463551171, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 8487 + }, + { + "epoch": 0.08488, + "grad_norm": 0.6335377972042376, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 8488 + }, + { + "epoch": 0.08489, + "grad_norm": 0.6719260396651473, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 8489 + }, + { + "epoch": 0.0849, + "grad_norm": 0.7312131012473647, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 8490 + }, + { + "epoch": 0.08491, + "grad_norm": 0.8249047120148688, + "learning_rate": 0.003, + "loss": 4.1104, + "step": 8491 + }, + { + "epoch": 0.08492, + "grad_norm": 1.0407015940181197, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 8492 + }, + { + "epoch": 0.08493, + "grad_norm": 1.0867956366330738, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 8493 + }, + { + "epoch": 0.08494, + "grad_norm": 0.915620699057293, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 8494 + }, + { + "epoch": 0.08495, + "grad_norm": 0.89156893468348, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 8495 + }, + { + "epoch": 0.08496, + "grad_norm": 0.8517601087707647, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 8496 + }, + { + "epoch": 0.08497, + "grad_norm": 0.7993759869320393, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 8497 + }, + { + "epoch": 0.08498, + "grad_norm": 0.7017883148327033, + "learning_rate": 0.003, + "loss": 4.1063, + "step": 8498 + }, + { + "epoch": 0.08499, + "grad_norm": 0.6236773075397719, + "learning_rate": 0.003, + "loss": 4.1259, + "step": 8499 + }, + { + "epoch": 0.085, + "grad_norm": 0.6286035320618882, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 8500 + }, + { + "epoch": 0.08501, + "grad_norm": 0.5212820111963481, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 8501 + }, + { + "epoch": 0.08502, + "grad_norm": 0.5487558855281737, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 8502 + }, + { + "epoch": 0.08503, + "grad_norm": 0.5707569684852064, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 8503 + }, + { + "epoch": 0.08504, + "grad_norm": 0.6687161931414275, + "learning_rate": 0.003, + "loss": 4.1237, + "step": 8504 + }, + { + "epoch": 0.08505, + "grad_norm": 0.6723026556005183, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 8505 + }, + { + "epoch": 0.08506, + "grad_norm": 0.6565086694246933, + "learning_rate": 0.003, + "loss": 4.1034, + "step": 8506 + }, + { + "epoch": 0.08507, + "grad_norm": 0.6310941478662354, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 8507 + }, + { + "epoch": 0.08508, + "grad_norm": 0.5824407794995421, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 8508 + }, + { + "epoch": 0.08509, + "grad_norm": 0.570213160176025, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 8509 + }, + { + "epoch": 0.0851, + "grad_norm": 0.545653653993305, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 8510 + }, + { + "epoch": 0.08511, + "grad_norm": 0.6166091785522915, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 8511 + }, + { + "epoch": 0.08512, + "grad_norm": 0.7266811939494829, + "learning_rate": 0.003, + "loss": 4.1261, + "step": 8512 + }, + { + "epoch": 0.08513, + "grad_norm": 1.0119925425784506, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 8513 + }, + { + "epoch": 0.08514, + "grad_norm": 1.0618994395885715, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 8514 + }, + { + "epoch": 0.08515, + "grad_norm": 0.8410180154663229, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 8515 + }, + { + "epoch": 0.08516, + "grad_norm": 0.7647905005211172, + "learning_rate": 0.003, + "loss": 4.1223, + "step": 8516 + }, + { + "epoch": 0.08517, + "grad_norm": 0.7839452502836995, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 8517 + }, + { + "epoch": 0.08518, + "grad_norm": 0.8837758490786244, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 8518 + }, + { + "epoch": 0.08519, + "grad_norm": 1.0026623503391587, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 8519 + }, + { + "epoch": 0.0852, + "grad_norm": 0.8804069304034173, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 8520 + }, + { + "epoch": 0.08521, + "grad_norm": 0.7391980251114973, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 8521 + }, + { + "epoch": 0.08522, + "grad_norm": 0.6582209926239634, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 8522 + }, + { + "epoch": 0.08523, + "grad_norm": 0.686774998714554, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 8523 + }, + { + "epoch": 0.08524, + "grad_norm": 0.6276671588615992, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 8524 + }, + { + "epoch": 0.08525, + "grad_norm": 0.6285944259527609, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 8525 + }, + { + "epoch": 0.08526, + "grad_norm": 0.6730183358064831, + "learning_rate": 0.003, + "loss": 4.088, + "step": 8526 + }, + { + "epoch": 0.08527, + "grad_norm": 0.7562963445445285, + "learning_rate": 0.003, + "loss": 4.091, + "step": 8527 + }, + { + "epoch": 0.08528, + "grad_norm": 0.7728429342633116, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 8528 + }, + { + "epoch": 0.08529, + "grad_norm": 0.765937905688893, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 8529 + }, + { + "epoch": 0.0853, + "grad_norm": 0.6845167962734888, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 8530 + }, + { + "epoch": 0.08531, + "grad_norm": 0.5726987582988281, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 8531 + }, + { + "epoch": 0.08532, + "grad_norm": 0.5597095167661914, + "learning_rate": 0.003, + "loss": 4.068, + "step": 8532 + }, + { + "epoch": 0.08533, + "grad_norm": 0.5041472063185792, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 8533 + }, + { + "epoch": 0.08534, + "grad_norm": 0.45851677887407977, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 8534 + }, + { + "epoch": 0.08535, + "grad_norm": 0.42358429319858426, + "learning_rate": 0.003, + "loss": 4.073, + "step": 8535 + }, + { + "epoch": 0.08536, + "grad_norm": 0.424584021481874, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 8536 + }, + { + "epoch": 0.08537, + "grad_norm": 0.5509844324435204, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 8537 + }, + { + "epoch": 0.08538, + "grad_norm": 0.7117815534754696, + "learning_rate": 0.003, + "loss": 4.066, + "step": 8538 + }, + { + "epoch": 0.08539, + "grad_norm": 0.8518554101874906, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 8539 + }, + { + "epoch": 0.0854, + "grad_norm": 0.9392658629366815, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 8540 + }, + { + "epoch": 0.08541, + "grad_norm": 0.9325601448881679, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 8541 + }, + { + "epoch": 0.08542, + "grad_norm": 0.8887155749787176, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 8542 + }, + { + "epoch": 0.08543, + "grad_norm": 0.9460176301826329, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 8543 + }, + { + "epoch": 0.08544, + "grad_norm": 0.8622072400696076, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 8544 + }, + { + "epoch": 0.08545, + "grad_norm": 0.8420027078116906, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 8545 + }, + { + "epoch": 0.08546, + "grad_norm": 0.7341692131081741, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 8546 + }, + { + "epoch": 0.08547, + "grad_norm": 0.7543016815952612, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 8547 + }, + { + "epoch": 0.08548, + "grad_norm": 0.7607341436346121, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 8548 + }, + { + "epoch": 0.08549, + "grad_norm": 0.7660211842461696, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 8549 + }, + { + "epoch": 0.0855, + "grad_norm": 0.8328996624474356, + "learning_rate": 0.003, + "loss": 4.127, + "step": 8550 + }, + { + "epoch": 0.08551, + "grad_norm": 0.9887027893940781, + "learning_rate": 0.003, + "loss": 4.1344, + "step": 8551 + }, + { + "epoch": 0.08552, + "grad_norm": 0.9138699679462117, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 8552 + }, + { + "epoch": 0.08553, + "grad_norm": 0.88796138098534, + "learning_rate": 0.003, + "loss": 4.1251, + "step": 8553 + }, + { + "epoch": 0.08554, + "grad_norm": 0.7864887844624548, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 8554 + }, + { + "epoch": 0.08555, + "grad_norm": 0.8150705947614163, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 8555 + }, + { + "epoch": 0.08556, + "grad_norm": 0.8837241002629734, + "learning_rate": 0.003, + "loss": 4.1262, + "step": 8556 + }, + { + "epoch": 0.08557, + "grad_norm": 0.9289452003464579, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 8557 + }, + { + "epoch": 0.08558, + "grad_norm": 0.9018548936220813, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 8558 + }, + { + "epoch": 0.08559, + "grad_norm": 0.8317138215654469, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 8559 + }, + { + "epoch": 0.0856, + "grad_norm": 0.7222621649910216, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 8560 + }, + { + "epoch": 0.08561, + "grad_norm": 0.6904128334298788, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 8561 + }, + { + "epoch": 0.08562, + "grad_norm": 0.6940485635066651, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 8562 + }, + { + "epoch": 0.08563, + "grad_norm": 0.7325452827552434, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 8563 + }, + { + "epoch": 0.08564, + "grad_norm": 0.8123368725671672, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 8564 + }, + { + "epoch": 0.08565, + "grad_norm": 0.9408503931561969, + "learning_rate": 0.003, + "loss": 4.096, + "step": 8565 + }, + { + "epoch": 0.08566, + "grad_norm": 1.0616374748054886, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 8566 + }, + { + "epoch": 0.08567, + "grad_norm": 1.047024424521824, + "learning_rate": 0.003, + "loss": 4.1249, + "step": 8567 + }, + { + "epoch": 0.08568, + "grad_norm": 0.8468698345129514, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 8568 + }, + { + "epoch": 0.08569, + "grad_norm": 0.7944472675814765, + "learning_rate": 0.003, + "loss": 4.1382, + "step": 8569 + }, + { + "epoch": 0.0857, + "grad_norm": 0.8045393385309881, + "learning_rate": 0.003, + "loss": 4.1327, + "step": 8570 + }, + { + "epoch": 0.08571, + "grad_norm": 0.9098867331516741, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 8571 + }, + { + "epoch": 0.08572, + "grad_norm": 0.9195544563171938, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 8572 + }, + { + "epoch": 0.08573, + "grad_norm": 0.8423884949796944, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 8573 + }, + { + "epoch": 0.08574, + "grad_norm": 0.8214333018685611, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 8574 + }, + { + "epoch": 0.08575, + "grad_norm": 0.6350812617594055, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 8575 + }, + { + "epoch": 0.08576, + "grad_norm": 0.5568431689715675, + "learning_rate": 0.003, + "loss": 4.1008, + "step": 8576 + }, + { + "epoch": 0.08577, + "grad_norm": 0.5632148561347211, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 8577 + }, + { + "epoch": 0.08578, + "grad_norm": 0.5940159555341951, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 8578 + }, + { + "epoch": 0.08579, + "grad_norm": 0.5819842281049936, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 8579 + }, + { + "epoch": 0.0858, + "grad_norm": 0.6099409069272952, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 8580 + }, + { + "epoch": 0.08581, + "grad_norm": 0.6188694008443366, + "learning_rate": 0.003, + "loss": 4.081, + "step": 8581 + }, + { + "epoch": 0.08582, + "grad_norm": 0.59379978193371, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 8582 + }, + { + "epoch": 0.08583, + "grad_norm": 0.5744750135944172, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 8583 + }, + { + "epoch": 0.08584, + "grad_norm": 0.6722444371986878, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 8584 + }, + { + "epoch": 0.08585, + "grad_norm": 0.7411902899854854, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 8585 + }, + { + "epoch": 0.08586, + "grad_norm": 0.7792170934870236, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 8586 + }, + { + "epoch": 0.08587, + "grad_norm": 0.6853345642159595, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 8587 + }, + { + "epoch": 0.08588, + "grad_norm": 0.6170369949749352, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 8588 + }, + { + "epoch": 0.08589, + "grad_norm": 0.49708112672361066, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 8589 + }, + { + "epoch": 0.0859, + "grad_norm": 0.5152160677520728, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 8590 + }, + { + "epoch": 0.08591, + "grad_norm": 0.49741486517713757, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 8591 + }, + { + "epoch": 0.08592, + "grad_norm": 0.5910307086102645, + "learning_rate": 0.003, + "loss": 4.1104, + "step": 8592 + }, + { + "epoch": 0.08593, + "grad_norm": 0.6649076276760316, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 8593 + }, + { + "epoch": 0.08594, + "grad_norm": 0.6902152865247433, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 8594 + }, + { + "epoch": 0.08595, + "grad_norm": 0.7200015025904549, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 8595 + }, + { + "epoch": 0.08596, + "grad_norm": 0.8369502085711913, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 8596 + }, + { + "epoch": 0.08597, + "grad_norm": 0.9081695796712485, + "learning_rate": 0.003, + "loss": 4.1094, + "step": 8597 + }, + { + "epoch": 0.08598, + "grad_norm": 0.8574484765894195, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 8598 + }, + { + "epoch": 0.08599, + "grad_norm": 0.7141093112530459, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 8599 + }, + { + "epoch": 0.086, + "grad_norm": 0.7618404036518704, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 8600 + }, + { + "epoch": 0.08601, + "grad_norm": 0.7471246187217334, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 8601 + }, + { + "epoch": 0.08602, + "grad_norm": 0.7547952980026077, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 8602 + }, + { + "epoch": 0.08603, + "grad_norm": 0.7629507458741042, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 8603 + }, + { + "epoch": 0.08604, + "grad_norm": 0.8034444554972223, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 8604 + }, + { + "epoch": 0.08605, + "grad_norm": 0.9611109929996176, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 8605 + }, + { + "epoch": 0.08606, + "grad_norm": 1.1203243516217172, + "learning_rate": 0.003, + "loss": 4.091, + "step": 8606 + }, + { + "epoch": 0.08607, + "grad_norm": 0.7916021324508382, + "learning_rate": 0.003, + "loss": 4.127, + "step": 8607 + }, + { + "epoch": 0.08608, + "grad_norm": 0.5801180546262028, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 8608 + }, + { + "epoch": 0.08609, + "grad_norm": 0.5983815121428754, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 8609 + }, + { + "epoch": 0.0861, + "grad_norm": 0.6226924849818875, + "learning_rate": 0.003, + "loss": 4.111, + "step": 8610 + }, + { + "epoch": 0.08611, + "grad_norm": 0.5973475758608405, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 8611 + }, + { + "epoch": 0.08612, + "grad_norm": 0.6361887564027072, + "learning_rate": 0.003, + "loss": 4.1275, + "step": 8612 + }, + { + "epoch": 0.08613, + "grad_norm": 0.6913314348371322, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 8613 + }, + { + "epoch": 0.08614, + "grad_norm": 0.6911565582790656, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 8614 + }, + { + "epoch": 0.08615, + "grad_norm": 0.6480201878105682, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 8615 + }, + { + "epoch": 0.08616, + "grad_norm": 0.621140757257243, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 8616 + }, + { + "epoch": 0.08617, + "grad_norm": 0.5814187395920917, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 8617 + }, + { + "epoch": 0.08618, + "grad_norm": 0.5949195662749583, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 8618 + }, + { + "epoch": 0.08619, + "grad_norm": 0.5474225046144992, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 8619 + }, + { + "epoch": 0.0862, + "grad_norm": 0.5884294701344107, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 8620 + }, + { + "epoch": 0.08621, + "grad_norm": 0.5414477831138819, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 8621 + }, + { + "epoch": 0.08622, + "grad_norm": 0.5853379182460213, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 8622 + }, + { + "epoch": 0.08623, + "grad_norm": 0.6748785821043584, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 8623 + }, + { + "epoch": 0.08624, + "grad_norm": 0.8007913851325251, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 8624 + }, + { + "epoch": 0.08625, + "grad_norm": 0.9532221305960991, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 8625 + }, + { + "epoch": 0.08626, + "grad_norm": 0.9383143976468733, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 8626 + }, + { + "epoch": 0.08627, + "grad_norm": 0.8045743378009347, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 8627 + }, + { + "epoch": 0.08628, + "grad_norm": 0.7402532485299439, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 8628 + }, + { + "epoch": 0.08629, + "grad_norm": 0.7187195965365745, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 8629 + }, + { + "epoch": 0.0863, + "grad_norm": 0.7749136270046196, + "learning_rate": 0.003, + "loss": 4.086, + "step": 8630 + }, + { + "epoch": 0.08631, + "grad_norm": 0.7803939926856307, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 8631 + }, + { + "epoch": 0.08632, + "grad_norm": 0.812820737542032, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 8632 + }, + { + "epoch": 0.08633, + "grad_norm": 0.9418729107920155, + "learning_rate": 0.003, + "loss": 4.1319, + "step": 8633 + }, + { + "epoch": 0.08634, + "grad_norm": 1.0595248522005765, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 8634 + }, + { + "epoch": 0.08635, + "grad_norm": 0.9627374328257062, + "learning_rate": 0.003, + "loss": 4.112, + "step": 8635 + }, + { + "epoch": 0.08636, + "grad_norm": 0.9731387011493425, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 8636 + }, + { + "epoch": 0.08637, + "grad_norm": 1.0615767557197648, + "learning_rate": 0.003, + "loss": 4.1494, + "step": 8637 + }, + { + "epoch": 0.08638, + "grad_norm": 0.8969163900961645, + "learning_rate": 0.003, + "loss": 4.1354, + "step": 8638 + }, + { + "epoch": 0.08639, + "grad_norm": 0.7836065427193415, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 8639 + }, + { + "epoch": 0.0864, + "grad_norm": 0.7397344262293751, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 8640 + }, + { + "epoch": 0.08641, + "grad_norm": 0.7324947749888412, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 8641 + }, + { + "epoch": 0.08642, + "grad_norm": 0.7288716146484082, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 8642 + }, + { + "epoch": 0.08643, + "grad_norm": 0.7339841907154421, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 8643 + }, + { + "epoch": 0.08644, + "grad_norm": 0.8107938826013737, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 8644 + }, + { + "epoch": 0.08645, + "grad_norm": 0.9556271679368206, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 8645 + }, + { + "epoch": 0.08646, + "grad_norm": 0.9674772661574546, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 8646 + }, + { + "epoch": 0.08647, + "grad_norm": 0.9453669207011212, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 8647 + }, + { + "epoch": 0.08648, + "grad_norm": 0.7876425030935472, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 8648 + }, + { + "epoch": 0.08649, + "grad_norm": 0.6724723608937945, + "learning_rate": 0.003, + "loss": 4.1303, + "step": 8649 + }, + { + "epoch": 0.0865, + "grad_norm": 0.7454146637283006, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 8650 + }, + { + "epoch": 0.08651, + "grad_norm": 0.8517115223988251, + "learning_rate": 0.003, + "loss": 4.1152, + "step": 8651 + }, + { + "epoch": 0.08652, + "grad_norm": 0.8860899877841061, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 8652 + }, + { + "epoch": 0.08653, + "grad_norm": 0.6986085802414974, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 8653 + }, + { + "epoch": 0.08654, + "grad_norm": 0.6383445520115142, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 8654 + }, + { + "epoch": 0.08655, + "grad_norm": 0.6126837697223702, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 8655 + }, + { + "epoch": 0.08656, + "grad_norm": 0.6303132972548725, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 8656 + }, + { + "epoch": 0.08657, + "grad_norm": 0.7051801520736573, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 8657 + }, + { + "epoch": 0.08658, + "grad_norm": 0.7534311537900272, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 8658 + }, + { + "epoch": 0.08659, + "grad_norm": 0.6915534733946069, + "learning_rate": 0.003, + "loss": 4.095, + "step": 8659 + }, + { + "epoch": 0.0866, + "grad_norm": 0.6192069664136859, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 8660 + }, + { + "epoch": 0.08661, + "grad_norm": 0.5572612681652647, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 8661 + }, + { + "epoch": 0.08662, + "grad_norm": 0.518904918411408, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 8662 + }, + { + "epoch": 0.08663, + "grad_norm": 0.45643753659454445, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 8663 + }, + { + "epoch": 0.08664, + "grad_norm": 0.40977161141131885, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 8664 + }, + { + "epoch": 0.08665, + "grad_norm": 0.3878668107895774, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 8665 + }, + { + "epoch": 0.08666, + "grad_norm": 0.413430960164737, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 8666 + }, + { + "epoch": 0.08667, + "grad_norm": 0.49272907946966926, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 8667 + }, + { + "epoch": 0.08668, + "grad_norm": 0.5766422713262668, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 8668 + }, + { + "epoch": 0.08669, + "grad_norm": 0.7090374371716733, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 8669 + }, + { + "epoch": 0.0867, + "grad_norm": 0.8903287925695574, + "learning_rate": 0.003, + "loss": 4.084, + "step": 8670 + }, + { + "epoch": 0.08671, + "grad_norm": 1.0465570588393474, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 8671 + }, + { + "epoch": 0.08672, + "grad_norm": 0.9634733776041171, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 8672 + }, + { + "epoch": 0.08673, + "grad_norm": 0.9660580246254519, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 8673 + }, + { + "epoch": 0.08674, + "grad_norm": 0.904283629793719, + "learning_rate": 0.003, + "loss": 4.1153, + "step": 8674 + }, + { + "epoch": 0.08675, + "grad_norm": 0.8438104142985071, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 8675 + }, + { + "epoch": 0.08676, + "grad_norm": 0.7857620375537205, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 8676 + }, + { + "epoch": 0.08677, + "grad_norm": 0.7969369610845204, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 8677 + }, + { + "epoch": 0.08678, + "grad_norm": 0.7396002144781133, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 8678 + }, + { + "epoch": 0.08679, + "grad_norm": 0.6673116262175337, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 8679 + }, + { + "epoch": 0.0868, + "grad_norm": 0.7555945744269758, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 8680 + }, + { + "epoch": 0.08681, + "grad_norm": 0.8839352666831195, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 8681 + }, + { + "epoch": 0.08682, + "grad_norm": 0.9051922142113155, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 8682 + }, + { + "epoch": 0.08683, + "grad_norm": 0.846027323488606, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 8683 + }, + { + "epoch": 0.08684, + "grad_norm": 0.7192665361560291, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 8684 + }, + { + "epoch": 0.08685, + "grad_norm": 0.643632971790141, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 8685 + }, + { + "epoch": 0.08686, + "grad_norm": 0.6466143794382575, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 8686 + }, + { + "epoch": 0.08687, + "grad_norm": 0.7039176614373789, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 8687 + }, + { + "epoch": 0.08688, + "grad_norm": 0.7899249787925823, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 8688 + }, + { + "epoch": 0.08689, + "grad_norm": 0.8216415805263771, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 8689 + }, + { + "epoch": 0.0869, + "grad_norm": 0.7040479359385806, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 8690 + }, + { + "epoch": 0.08691, + "grad_norm": 0.6418926125651642, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 8691 + }, + { + "epoch": 0.08692, + "grad_norm": 0.7077297817088414, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 8692 + }, + { + "epoch": 0.08693, + "grad_norm": 0.7606697220731845, + "learning_rate": 0.003, + "loss": 4.076, + "step": 8693 + }, + { + "epoch": 0.08694, + "grad_norm": 0.7152450767122566, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 8694 + }, + { + "epoch": 0.08695, + "grad_norm": 0.6231380065971586, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 8695 + }, + { + "epoch": 0.08696, + "grad_norm": 0.5828288246085719, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 8696 + }, + { + "epoch": 0.08697, + "grad_norm": 0.5953032303327713, + "learning_rate": 0.003, + "loss": 4.105, + "step": 8697 + }, + { + "epoch": 0.08698, + "grad_norm": 0.6083787084008646, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 8698 + }, + { + "epoch": 0.08699, + "grad_norm": 0.7378994678558721, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 8699 + }, + { + "epoch": 0.087, + "grad_norm": 0.9161671171702003, + "learning_rate": 0.003, + "loss": 4.1235, + "step": 8700 + }, + { + "epoch": 0.08701, + "grad_norm": 1.1377552843109822, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 8701 + }, + { + "epoch": 0.08702, + "grad_norm": 0.8079302042495463, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 8702 + }, + { + "epoch": 0.08703, + "grad_norm": 0.608371143181045, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 8703 + }, + { + "epoch": 0.08704, + "grad_norm": 0.6281858127290479, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 8704 + }, + { + "epoch": 0.08705, + "grad_norm": 0.6793561925183917, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 8705 + }, + { + "epoch": 0.08706, + "grad_norm": 0.7106800046852197, + "learning_rate": 0.003, + "loss": 4.085, + "step": 8706 + }, + { + "epoch": 0.08707, + "grad_norm": 0.7141650332904569, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 8707 + }, + { + "epoch": 0.08708, + "grad_norm": 0.8271404025231711, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 8708 + }, + { + "epoch": 0.08709, + "grad_norm": 1.0474926164726928, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 8709 + }, + { + "epoch": 0.0871, + "grad_norm": 0.9907087620885723, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 8710 + }, + { + "epoch": 0.08711, + "grad_norm": 1.0656047756713767, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 8711 + }, + { + "epoch": 0.08712, + "grad_norm": 0.881294661258653, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 8712 + }, + { + "epoch": 0.08713, + "grad_norm": 0.9263143666012618, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 8713 + }, + { + "epoch": 0.08714, + "grad_norm": 0.9652223357853811, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 8714 + }, + { + "epoch": 0.08715, + "grad_norm": 1.1556407233323303, + "learning_rate": 0.003, + "loss": 4.1354, + "step": 8715 + }, + { + "epoch": 0.08716, + "grad_norm": 0.8600125406687056, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 8716 + }, + { + "epoch": 0.08717, + "grad_norm": 0.7952102732113174, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 8717 + }, + { + "epoch": 0.08718, + "grad_norm": 0.709911569034494, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 8718 + }, + { + "epoch": 0.08719, + "grad_norm": 0.6127044794162302, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 8719 + }, + { + "epoch": 0.0872, + "grad_norm": 0.5418597465972178, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 8720 + }, + { + "epoch": 0.08721, + "grad_norm": 0.545384052999374, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 8721 + }, + { + "epoch": 0.08722, + "grad_norm": 0.5854972859534596, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 8722 + }, + { + "epoch": 0.08723, + "grad_norm": 0.599752513559582, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 8723 + }, + { + "epoch": 0.08724, + "grad_norm": 0.5979573041484251, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 8724 + }, + { + "epoch": 0.08725, + "grad_norm": 0.6180900508607188, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 8725 + }, + { + "epoch": 0.08726, + "grad_norm": 0.6336982537847569, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 8726 + }, + { + "epoch": 0.08727, + "grad_norm": 0.7620135479461384, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 8727 + }, + { + "epoch": 0.08728, + "grad_norm": 0.8150934564907844, + "learning_rate": 0.003, + "loss": 4.063, + "step": 8728 + }, + { + "epoch": 0.08729, + "grad_norm": 0.6748250974417187, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 8729 + }, + { + "epoch": 0.0873, + "grad_norm": 0.5106549285627258, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 8730 + }, + { + "epoch": 0.08731, + "grad_norm": 0.616467259066863, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 8731 + }, + { + "epoch": 0.08732, + "grad_norm": 0.6409301951961746, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 8732 + }, + { + "epoch": 0.08733, + "grad_norm": 0.6531284409173848, + "learning_rate": 0.003, + "loss": 4.075, + "step": 8733 + }, + { + "epoch": 0.08734, + "grad_norm": 0.6909519673379502, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 8734 + }, + { + "epoch": 0.08735, + "grad_norm": 0.6949538564230802, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 8735 + }, + { + "epoch": 0.08736, + "grad_norm": 0.6353533341447787, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 8736 + }, + { + "epoch": 0.08737, + "grad_norm": 0.5766041576549661, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 8737 + }, + { + "epoch": 0.08738, + "grad_norm": 0.5636229267831031, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 8738 + }, + { + "epoch": 0.08739, + "grad_norm": 0.5714257910724869, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 8739 + }, + { + "epoch": 0.0874, + "grad_norm": 0.6664233704269024, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 8740 + }, + { + "epoch": 0.08741, + "grad_norm": 0.8499302285997681, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 8741 + }, + { + "epoch": 0.08742, + "grad_norm": 1.1223141440691569, + "learning_rate": 0.003, + "loss": 4.087, + "step": 8742 + }, + { + "epoch": 0.08743, + "grad_norm": 0.9050529775378228, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 8743 + }, + { + "epoch": 0.08744, + "grad_norm": 0.7951156037626225, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 8744 + }, + { + "epoch": 0.08745, + "grad_norm": 0.9924672009502984, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 8745 + }, + { + "epoch": 0.08746, + "grad_norm": 0.971675314061291, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 8746 + }, + { + "epoch": 0.08747, + "grad_norm": 0.9032139097191704, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 8747 + }, + { + "epoch": 0.08748, + "grad_norm": 0.798921343364292, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 8748 + }, + { + "epoch": 0.08749, + "grad_norm": 0.7958369681342767, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 8749 + }, + { + "epoch": 0.0875, + "grad_norm": 0.7880652528528818, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 8750 + }, + { + "epoch": 0.08751, + "grad_norm": 0.7117714704920254, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 8751 + }, + { + "epoch": 0.08752, + "grad_norm": 0.6564224414293092, + "learning_rate": 0.003, + "loss": 4.1211, + "step": 8752 + }, + { + "epoch": 0.08753, + "grad_norm": 0.6251613210169059, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 8753 + }, + { + "epoch": 0.08754, + "grad_norm": 0.6545076658857255, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 8754 + }, + { + "epoch": 0.08755, + "grad_norm": 0.6299656295806964, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 8755 + }, + { + "epoch": 0.08756, + "grad_norm": 0.6937544705015782, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 8756 + }, + { + "epoch": 0.08757, + "grad_norm": 0.7613436036720337, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 8757 + }, + { + "epoch": 0.08758, + "grad_norm": 0.7935810078864296, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 8758 + }, + { + "epoch": 0.08759, + "grad_norm": 0.9218245052230097, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 8759 + }, + { + "epoch": 0.0876, + "grad_norm": 1.1133184341136968, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 8760 + }, + { + "epoch": 0.08761, + "grad_norm": 0.8471729174022539, + "learning_rate": 0.003, + "loss": 4.1307, + "step": 8761 + }, + { + "epoch": 0.08762, + "grad_norm": 0.6885885491623273, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 8762 + }, + { + "epoch": 0.08763, + "grad_norm": 0.6900728124677, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 8763 + }, + { + "epoch": 0.08764, + "grad_norm": 0.7448709364689655, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 8764 + }, + { + "epoch": 0.08765, + "grad_norm": 0.7003442871930288, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 8765 + }, + { + "epoch": 0.08766, + "grad_norm": 0.695189743891509, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 8766 + }, + { + "epoch": 0.08767, + "grad_norm": 0.6561447154517275, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 8767 + }, + { + "epoch": 0.08768, + "grad_norm": 0.6134378072581433, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 8768 + }, + { + "epoch": 0.08769, + "grad_norm": 0.6152250798453555, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 8769 + }, + { + "epoch": 0.0877, + "grad_norm": 0.6450512433536177, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 8770 + }, + { + "epoch": 0.08771, + "grad_norm": 0.7339302655862884, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 8771 + }, + { + "epoch": 0.08772, + "grad_norm": 0.8938660355068361, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 8772 + }, + { + "epoch": 0.08773, + "grad_norm": 0.918539366218562, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 8773 + }, + { + "epoch": 0.08774, + "grad_norm": 0.8161914119839463, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 8774 + }, + { + "epoch": 0.08775, + "grad_norm": 0.7485358173137795, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 8775 + }, + { + "epoch": 0.08776, + "grad_norm": 0.6501840423824952, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 8776 + }, + { + "epoch": 0.08777, + "grad_norm": 0.6227297966691554, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 8777 + }, + { + "epoch": 0.08778, + "grad_norm": 0.6147681929920195, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 8778 + }, + { + "epoch": 0.08779, + "grad_norm": 0.6711211259228609, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 8779 + }, + { + "epoch": 0.0878, + "grad_norm": 0.6873812876268875, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 8780 + }, + { + "epoch": 0.08781, + "grad_norm": 0.7894739230084857, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 8781 + }, + { + "epoch": 0.08782, + "grad_norm": 0.9319206036015761, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 8782 + }, + { + "epoch": 0.08783, + "grad_norm": 0.785166946779399, + "learning_rate": 0.003, + "loss": 4.1055, + "step": 8783 + }, + { + "epoch": 0.08784, + "grad_norm": 0.6883174398483335, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 8784 + }, + { + "epoch": 0.08785, + "grad_norm": 0.7618236271622089, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 8785 + }, + { + "epoch": 0.08786, + "grad_norm": 0.7882058303319279, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 8786 + }, + { + "epoch": 0.08787, + "grad_norm": 0.8009435303009658, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 8787 + }, + { + "epoch": 0.08788, + "grad_norm": 0.8043157032401507, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 8788 + }, + { + "epoch": 0.08789, + "grad_norm": 0.8053019098728231, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 8789 + }, + { + "epoch": 0.0879, + "grad_norm": 0.7507680332954385, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 8790 + }, + { + "epoch": 0.08791, + "grad_norm": 0.742881908635139, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 8791 + }, + { + "epoch": 0.08792, + "grad_norm": 0.6619963633805601, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 8792 + }, + { + "epoch": 0.08793, + "grad_norm": 0.6836794010275485, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 8793 + }, + { + "epoch": 0.08794, + "grad_norm": 0.8299110382089389, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 8794 + }, + { + "epoch": 0.08795, + "grad_norm": 1.0468573387811748, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 8795 + }, + { + "epoch": 0.08796, + "grad_norm": 1.0109579043602372, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 8796 + }, + { + "epoch": 0.08797, + "grad_norm": 0.9354583933034025, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 8797 + }, + { + "epoch": 0.08798, + "grad_norm": 0.8801038141490901, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 8798 + }, + { + "epoch": 0.08799, + "grad_norm": 0.8373596391595058, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 8799 + }, + { + "epoch": 0.088, + "grad_norm": 0.8236058297216525, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 8800 + }, + { + "epoch": 0.08801, + "grad_norm": 0.9171639160570122, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 8801 + }, + { + "epoch": 0.08802, + "grad_norm": 0.950129352800799, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 8802 + }, + { + "epoch": 0.08803, + "grad_norm": 0.97417915794881, + "learning_rate": 0.003, + "loss": 4.087, + "step": 8803 + }, + { + "epoch": 0.08804, + "grad_norm": 0.9961244608205473, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 8804 + }, + { + "epoch": 0.08805, + "grad_norm": 0.8887568015028089, + "learning_rate": 0.003, + "loss": 4.085, + "step": 8805 + }, + { + "epoch": 0.08806, + "grad_norm": 0.8507811413672998, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 8806 + }, + { + "epoch": 0.08807, + "grad_norm": 0.7799651810021191, + "learning_rate": 0.003, + "loss": 4.114, + "step": 8807 + }, + { + "epoch": 0.08808, + "grad_norm": 0.6310859184139921, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 8808 + }, + { + "epoch": 0.08809, + "grad_norm": 0.5415343793054738, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 8809 + }, + { + "epoch": 0.0881, + "grad_norm": 0.5417480126046187, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 8810 + }, + { + "epoch": 0.08811, + "grad_norm": 0.5437548342900228, + "learning_rate": 0.003, + "loss": 4.091, + "step": 8811 + }, + { + "epoch": 0.08812, + "grad_norm": 0.5065008549086789, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 8812 + }, + { + "epoch": 0.08813, + "grad_norm": 0.44906733576252533, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 8813 + }, + { + "epoch": 0.08814, + "grad_norm": 0.4572901434044213, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 8814 + }, + { + "epoch": 0.08815, + "grad_norm": 0.39740682905597274, + "learning_rate": 0.003, + "loss": 4.096, + "step": 8815 + }, + { + "epoch": 0.08816, + "grad_norm": 0.41394278364223486, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 8816 + }, + { + "epoch": 0.08817, + "grad_norm": 0.40780800547558976, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 8817 + }, + { + "epoch": 0.08818, + "grad_norm": 0.45313876957887234, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 8818 + }, + { + "epoch": 0.08819, + "grad_norm": 0.47289697821709104, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 8819 + }, + { + "epoch": 0.0882, + "grad_norm": 0.5347028769974915, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 8820 + }, + { + "epoch": 0.08821, + "grad_norm": 0.6073667369132447, + "learning_rate": 0.003, + "loss": 4.1132, + "step": 8821 + }, + { + "epoch": 0.08822, + "grad_norm": 0.631563807802477, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 8822 + }, + { + "epoch": 0.08823, + "grad_norm": 0.8219598063684808, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 8823 + }, + { + "epoch": 0.08824, + "grad_norm": 1.1113151394557685, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 8824 + }, + { + "epoch": 0.08825, + "grad_norm": 0.9855650333902289, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 8825 + }, + { + "epoch": 0.08826, + "grad_norm": 0.9778239420914661, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 8826 + }, + { + "epoch": 0.08827, + "grad_norm": 0.8179910297155395, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 8827 + }, + { + "epoch": 0.08828, + "grad_norm": 0.7966604826205193, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 8828 + }, + { + "epoch": 0.08829, + "grad_norm": 0.7977026742349369, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 8829 + }, + { + "epoch": 0.0883, + "grad_norm": 0.792129459764214, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 8830 + }, + { + "epoch": 0.08831, + "grad_norm": 0.9326888184375913, + "learning_rate": 0.003, + "loss": 4.1008, + "step": 8831 + }, + { + "epoch": 0.08832, + "grad_norm": 0.866391958750175, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 8832 + }, + { + "epoch": 0.08833, + "grad_norm": 0.8717561470437578, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 8833 + }, + { + "epoch": 0.08834, + "grad_norm": 0.9086463815238154, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 8834 + }, + { + "epoch": 0.08835, + "grad_norm": 0.9982211117034512, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 8835 + }, + { + "epoch": 0.08836, + "grad_norm": 1.0116778830665036, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 8836 + }, + { + "epoch": 0.08837, + "grad_norm": 0.9047723332769526, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 8837 + }, + { + "epoch": 0.08838, + "grad_norm": 0.7218841347767363, + "learning_rate": 0.003, + "loss": 4.105, + "step": 8838 + }, + { + "epoch": 0.08839, + "grad_norm": 0.6219028863083907, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 8839 + }, + { + "epoch": 0.0884, + "grad_norm": 0.6370526447568182, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 8840 + }, + { + "epoch": 0.08841, + "grad_norm": 0.6865207786249934, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 8841 + }, + { + "epoch": 0.08842, + "grad_norm": 0.8079884245973484, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 8842 + }, + { + "epoch": 0.08843, + "grad_norm": 0.7716447791785704, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 8843 + }, + { + "epoch": 0.08844, + "grad_norm": 0.6552595716080165, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 8844 + }, + { + "epoch": 0.08845, + "grad_norm": 0.6411659090784259, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 8845 + }, + { + "epoch": 0.08846, + "grad_norm": 0.6345516705136226, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 8846 + }, + { + "epoch": 0.08847, + "grad_norm": 0.6208423879645917, + "learning_rate": 0.003, + "loss": 4.1138, + "step": 8847 + }, + { + "epoch": 0.08848, + "grad_norm": 0.6896343586488453, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 8848 + }, + { + "epoch": 0.08849, + "grad_norm": 0.6711212989454174, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 8849 + }, + { + "epoch": 0.0885, + "grad_norm": 0.787384336504839, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 8850 + }, + { + "epoch": 0.08851, + "grad_norm": 0.9296696469370899, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 8851 + }, + { + "epoch": 0.08852, + "grad_norm": 0.9491351647008734, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 8852 + }, + { + "epoch": 0.08853, + "grad_norm": 0.8823989114621453, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 8853 + }, + { + "epoch": 0.08854, + "grad_norm": 0.7356047162096173, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 8854 + }, + { + "epoch": 0.08855, + "grad_norm": 0.6817590094654521, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 8855 + }, + { + "epoch": 0.08856, + "grad_norm": 0.6432697273240098, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 8856 + }, + { + "epoch": 0.08857, + "grad_norm": 0.5912854441908766, + "learning_rate": 0.003, + "loss": 4.097, + "step": 8857 + }, + { + "epoch": 0.08858, + "grad_norm": 0.6032547637065733, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 8858 + }, + { + "epoch": 0.08859, + "grad_norm": 0.5317724895750301, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 8859 + }, + { + "epoch": 0.0886, + "grad_norm": 0.490974624643615, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 8860 + }, + { + "epoch": 0.08861, + "grad_norm": 0.5134567395667045, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 8861 + }, + { + "epoch": 0.08862, + "grad_norm": 0.563666635376977, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 8862 + }, + { + "epoch": 0.08863, + "grad_norm": 0.7032312250587909, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 8863 + }, + { + "epoch": 0.08864, + "grad_norm": 0.8431923998213133, + "learning_rate": 0.003, + "loss": 4.103, + "step": 8864 + }, + { + "epoch": 0.08865, + "grad_norm": 0.9517906387901386, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 8865 + }, + { + "epoch": 0.08866, + "grad_norm": 0.894832461876128, + "learning_rate": 0.003, + "loss": 4.073, + "step": 8866 + }, + { + "epoch": 0.08867, + "grad_norm": 0.7886402628779215, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 8867 + }, + { + "epoch": 0.08868, + "grad_norm": 0.7427477147931492, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 8868 + }, + { + "epoch": 0.08869, + "grad_norm": 0.7801814362733093, + "learning_rate": 0.003, + "loss": 4.14, + "step": 8869 + }, + { + "epoch": 0.0887, + "grad_norm": 0.9378074963893244, + "learning_rate": 0.003, + "loss": 4.079, + "step": 8870 + }, + { + "epoch": 0.08871, + "grad_norm": 1.0785126885762188, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 8871 + }, + { + "epoch": 0.08872, + "grad_norm": 0.9686509189429044, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 8872 + }, + { + "epoch": 0.08873, + "grad_norm": 0.9359552632213535, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 8873 + }, + { + "epoch": 0.08874, + "grad_norm": 0.801952593241515, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 8874 + }, + { + "epoch": 0.08875, + "grad_norm": 0.7300737081360735, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 8875 + }, + { + "epoch": 0.08876, + "grad_norm": 0.6845784358472665, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 8876 + }, + { + "epoch": 0.08877, + "grad_norm": 0.5992699690439727, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 8877 + }, + { + "epoch": 0.08878, + "grad_norm": 0.6987685567227716, + "learning_rate": 0.003, + "loss": 4.096, + "step": 8878 + }, + { + "epoch": 0.08879, + "grad_norm": 0.755918105586785, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 8879 + }, + { + "epoch": 0.0888, + "grad_norm": 0.804896715950834, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 8880 + }, + { + "epoch": 0.08881, + "grad_norm": 0.8597307396006292, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 8881 + }, + { + "epoch": 0.08882, + "grad_norm": 0.890705623678484, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 8882 + }, + { + "epoch": 0.08883, + "grad_norm": 0.87665671220314, + "learning_rate": 0.003, + "loss": 4.1507, + "step": 8883 + }, + { + "epoch": 0.08884, + "grad_norm": 0.9160504734206271, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 8884 + }, + { + "epoch": 0.08885, + "grad_norm": 0.9915450251876738, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 8885 + }, + { + "epoch": 0.08886, + "grad_norm": 0.9459675534923236, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 8886 + }, + { + "epoch": 0.08887, + "grad_norm": 0.8822638917509666, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 8887 + }, + { + "epoch": 0.08888, + "grad_norm": 0.9182248673162067, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 8888 + }, + { + "epoch": 0.08889, + "grad_norm": 0.8960795751997341, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 8889 + }, + { + "epoch": 0.0889, + "grad_norm": 0.7029348940189843, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 8890 + }, + { + "epoch": 0.08891, + "grad_norm": 0.5917270992086658, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 8891 + }, + { + "epoch": 0.08892, + "grad_norm": 0.5729686662161869, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 8892 + }, + { + "epoch": 0.08893, + "grad_norm": 0.4933602379272417, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 8893 + }, + { + "epoch": 0.08894, + "grad_norm": 0.4816956228568598, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 8894 + }, + { + "epoch": 0.08895, + "grad_norm": 0.5401089965190982, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 8895 + }, + { + "epoch": 0.08896, + "grad_norm": 0.6834674353769287, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 8896 + }, + { + "epoch": 0.08897, + "grad_norm": 0.9000006866364382, + "learning_rate": 0.003, + "loss": 4.082, + "step": 8897 + }, + { + "epoch": 0.08898, + "grad_norm": 1.0128344751677745, + "learning_rate": 0.003, + "loss": 4.108, + "step": 8898 + }, + { + "epoch": 0.08899, + "grad_norm": 0.8361225818664343, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 8899 + }, + { + "epoch": 0.089, + "grad_norm": 0.7604422714843287, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 8900 + }, + { + "epoch": 0.08901, + "grad_norm": 0.7721795341276967, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 8901 + }, + { + "epoch": 0.08902, + "grad_norm": 0.900896957082942, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 8902 + }, + { + "epoch": 0.08903, + "grad_norm": 0.8920547801921579, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 8903 + }, + { + "epoch": 0.08904, + "grad_norm": 0.8420148885506412, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 8904 + }, + { + "epoch": 0.08905, + "grad_norm": 0.8570250538255739, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 8905 + }, + { + "epoch": 0.08906, + "grad_norm": 0.9205488642424341, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 8906 + }, + { + "epoch": 0.08907, + "grad_norm": 1.0405334047811572, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 8907 + }, + { + "epoch": 0.08908, + "grad_norm": 0.9468509956798719, + "learning_rate": 0.003, + "loss": 4.1323, + "step": 8908 + }, + { + "epoch": 0.08909, + "grad_norm": 0.6926689852243239, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 8909 + }, + { + "epoch": 0.0891, + "grad_norm": 0.673009652964538, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 8910 + }, + { + "epoch": 0.08911, + "grad_norm": 0.6764278448657347, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 8911 + }, + { + "epoch": 0.08912, + "grad_norm": 0.713765564163065, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 8912 + }, + { + "epoch": 0.08913, + "grad_norm": 0.6427371366529472, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 8913 + }, + { + "epoch": 0.08914, + "grad_norm": 0.581053838640892, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 8914 + }, + { + "epoch": 0.08915, + "grad_norm": 0.6154512813452391, + "learning_rate": 0.003, + "loss": 4.1121, + "step": 8915 + }, + { + "epoch": 0.08916, + "grad_norm": 0.5811153917294403, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 8916 + }, + { + "epoch": 0.08917, + "grad_norm": 0.5826469378007894, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 8917 + }, + { + "epoch": 0.08918, + "grad_norm": 0.659868224790148, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 8918 + }, + { + "epoch": 0.08919, + "grad_norm": 0.7475809786764346, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 8919 + }, + { + "epoch": 0.0892, + "grad_norm": 0.8085506843447738, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 8920 + }, + { + "epoch": 0.08921, + "grad_norm": 0.8851417211555018, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 8921 + }, + { + "epoch": 0.08922, + "grad_norm": 0.8405802565396343, + "learning_rate": 0.003, + "loss": 4.11, + "step": 8922 + }, + { + "epoch": 0.08923, + "grad_norm": 0.7493224650962431, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 8923 + }, + { + "epoch": 0.08924, + "grad_norm": 0.6524585129634565, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 8924 + }, + { + "epoch": 0.08925, + "grad_norm": 0.6041119003990524, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 8925 + }, + { + "epoch": 0.08926, + "grad_norm": 0.6170325812343832, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 8926 + }, + { + "epoch": 0.08927, + "grad_norm": 0.5943880002209878, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 8927 + }, + { + "epoch": 0.08928, + "grad_norm": 0.6087426636265381, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 8928 + }, + { + "epoch": 0.08929, + "grad_norm": 0.6534682552946309, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 8929 + }, + { + "epoch": 0.0893, + "grad_norm": 0.7123983106901958, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 8930 + }, + { + "epoch": 0.08931, + "grad_norm": 0.6560939231760763, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 8931 + }, + { + "epoch": 0.08932, + "grad_norm": 0.5027522782217787, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 8932 + }, + { + "epoch": 0.08933, + "grad_norm": 0.4365213401530024, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 8933 + }, + { + "epoch": 0.08934, + "grad_norm": 0.5000766612819153, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 8934 + }, + { + "epoch": 0.08935, + "grad_norm": 0.5952483131640884, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 8935 + }, + { + "epoch": 0.08936, + "grad_norm": 0.6156724481814724, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 8936 + }, + { + "epoch": 0.08937, + "grad_norm": 0.5931999695378637, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 8937 + }, + { + "epoch": 0.08938, + "grad_norm": 0.6894161685221871, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 8938 + }, + { + "epoch": 0.08939, + "grad_norm": 0.7627353264346703, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 8939 + }, + { + "epoch": 0.0894, + "grad_norm": 0.7356138960546812, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 8940 + }, + { + "epoch": 0.08941, + "grad_norm": 0.6420508209011688, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 8941 + }, + { + "epoch": 0.08942, + "grad_norm": 0.6518833670204465, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 8942 + }, + { + "epoch": 0.08943, + "grad_norm": 0.6146219772409397, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 8943 + }, + { + "epoch": 0.08944, + "grad_norm": 0.6396512737329096, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 8944 + }, + { + "epoch": 0.08945, + "grad_norm": 0.6374698041220779, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 8945 + }, + { + "epoch": 0.08946, + "grad_norm": 0.7068156644089753, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 8946 + }, + { + "epoch": 0.08947, + "grad_norm": 0.845317356111474, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 8947 + }, + { + "epoch": 0.08948, + "grad_norm": 1.0368306661318563, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 8948 + }, + { + "epoch": 0.08949, + "grad_norm": 1.198555666195333, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 8949 + }, + { + "epoch": 0.0895, + "grad_norm": 0.5996385514208301, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 8950 + }, + { + "epoch": 0.08951, + "grad_norm": 0.7108449111037725, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 8951 + }, + { + "epoch": 0.08952, + "grad_norm": 0.855830753478774, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 8952 + }, + { + "epoch": 0.08953, + "grad_norm": 0.9206337595900704, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 8953 + }, + { + "epoch": 0.08954, + "grad_norm": 0.812196407142358, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 8954 + }, + { + "epoch": 0.08955, + "grad_norm": 0.7982919359283707, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 8955 + }, + { + "epoch": 0.08956, + "grad_norm": 0.8635629919504556, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 8956 + }, + { + "epoch": 0.08957, + "grad_norm": 0.9322894837024904, + "learning_rate": 0.003, + "loss": 4.118, + "step": 8957 + }, + { + "epoch": 0.08958, + "grad_norm": 0.9187549703391346, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 8958 + }, + { + "epoch": 0.08959, + "grad_norm": 0.9062695515862552, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 8959 + }, + { + "epoch": 0.0896, + "grad_norm": 0.8138333486268774, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 8960 + }, + { + "epoch": 0.08961, + "grad_norm": 0.8118676867241735, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 8961 + }, + { + "epoch": 0.08962, + "grad_norm": 0.915869089395175, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 8962 + }, + { + "epoch": 0.08963, + "grad_norm": 1.105783806141911, + "learning_rate": 0.003, + "loss": 4.1452, + "step": 8963 + }, + { + "epoch": 0.08964, + "grad_norm": 0.9871252968891374, + "learning_rate": 0.003, + "loss": 4.1247, + "step": 8964 + }, + { + "epoch": 0.08965, + "grad_norm": 0.9621593755289617, + "learning_rate": 0.003, + "loss": 4.1319, + "step": 8965 + }, + { + "epoch": 0.08966, + "grad_norm": 0.8845068329445493, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 8966 + }, + { + "epoch": 0.08967, + "grad_norm": 0.9262452245654436, + "learning_rate": 0.003, + "loss": 4.1443, + "step": 8967 + }, + { + "epoch": 0.08968, + "grad_norm": 1.0690148845058771, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 8968 + }, + { + "epoch": 0.08969, + "grad_norm": 0.9874560303810536, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 8969 + }, + { + "epoch": 0.0897, + "grad_norm": 1.1933287056871495, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 8970 + }, + { + "epoch": 0.08971, + "grad_norm": 0.83496478128569, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 8971 + }, + { + "epoch": 0.08972, + "grad_norm": 0.8851270091462285, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 8972 + }, + { + "epoch": 0.08973, + "grad_norm": 0.7361881327430853, + "learning_rate": 0.003, + "loss": 4.139, + "step": 8973 + }, + { + "epoch": 0.08974, + "grad_norm": 0.6947872113441466, + "learning_rate": 0.003, + "loss": 4.1308, + "step": 8974 + }, + { + "epoch": 0.08975, + "grad_norm": 0.6448548962054766, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 8975 + }, + { + "epoch": 0.08976, + "grad_norm": 0.6115477618993307, + "learning_rate": 0.003, + "loss": 4.12, + "step": 8976 + }, + { + "epoch": 0.08977, + "grad_norm": 0.6406580925867694, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 8977 + }, + { + "epoch": 0.08978, + "grad_norm": 0.5844796689209106, + "learning_rate": 0.003, + "loss": 4.1068, + "step": 8978 + }, + { + "epoch": 0.08979, + "grad_norm": 0.4899670091468122, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 8979 + }, + { + "epoch": 0.0898, + "grad_norm": 0.45907274050387364, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 8980 + }, + { + "epoch": 0.08981, + "grad_norm": 0.4787661572615123, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 8981 + }, + { + "epoch": 0.08982, + "grad_norm": 0.5314004944865418, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 8982 + }, + { + "epoch": 0.08983, + "grad_norm": 0.5367270316206084, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 8983 + }, + { + "epoch": 0.08984, + "grad_norm": 0.6099160821198948, + "learning_rate": 0.003, + "loss": 4.111, + "step": 8984 + }, + { + "epoch": 0.08985, + "grad_norm": 0.6151453422803892, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 8985 + }, + { + "epoch": 0.08986, + "grad_norm": 0.6431362909917693, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 8986 + }, + { + "epoch": 0.08987, + "grad_norm": 0.6245835812002626, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 8987 + }, + { + "epoch": 0.08988, + "grad_norm": 0.65684747848799, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 8988 + }, + { + "epoch": 0.08989, + "grad_norm": 0.7624170463908312, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 8989 + }, + { + "epoch": 0.0899, + "grad_norm": 0.7803016187618669, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 8990 + }, + { + "epoch": 0.08991, + "grad_norm": 0.6199387610169692, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 8991 + }, + { + "epoch": 0.08992, + "grad_norm": 0.6328852031482564, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 8992 + }, + { + "epoch": 0.08993, + "grad_norm": 0.6444379781150428, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 8993 + }, + { + "epoch": 0.08994, + "grad_norm": 0.7004376193465449, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 8994 + }, + { + "epoch": 0.08995, + "grad_norm": 0.7021581496420412, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 8995 + }, + { + "epoch": 0.08996, + "grad_norm": 0.735896084022374, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 8996 + }, + { + "epoch": 0.08997, + "grad_norm": 0.7654208285833823, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 8997 + }, + { + "epoch": 0.08998, + "grad_norm": 0.8110039047646157, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 8998 + }, + { + "epoch": 0.08999, + "grad_norm": 0.9354160143582295, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 8999 + }, + { + "epoch": 0.09, + "grad_norm": 0.9976774961172428, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 9000 + }, + { + "epoch": 0.09001, + "grad_norm": 0.9736972254727577, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 9001 + }, + { + "epoch": 0.09002, + "grad_norm": 0.9273588401300575, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 9002 + }, + { + "epoch": 0.09003, + "grad_norm": 0.9369574420893962, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 9003 + }, + { + "epoch": 0.09004, + "grad_norm": 0.8974590725426277, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 9004 + }, + { + "epoch": 0.09005, + "grad_norm": 0.9593522302258589, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 9005 + }, + { + "epoch": 0.09006, + "grad_norm": 1.148323754626388, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 9006 + }, + { + "epoch": 0.09007, + "grad_norm": 0.7314690534751829, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 9007 + }, + { + "epoch": 0.09008, + "grad_norm": 0.7435332389856724, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 9008 + }, + { + "epoch": 0.09009, + "grad_norm": 0.7078932397563384, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 9009 + }, + { + "epoch": 0.0901, + "grad_norm": 0.7615661108195854, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 9010 + }, + { + "epoch": 0.09011, + "grad_norm": 0.6793695558724677, + "learning_rate": 0.003, + "loss": 4.093, + "step": 9011 + }, + { + "epoch": 0.09012, + "grad_norm": 0.5832604883899358, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 9012 + }, + { + "epoch": 0.09013, + "grad_norm": 0.6009269936700211, + "learning_rate": 0.003, + "loss": 4.081, + "step": 9013 + }, + { + "epoch": 0.09014, + "grad_norm": 0.5849630009375443, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 9014 + }, + { + "epoch": 0.09015, + "grad_norm": 0.6889294107758397, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 9015 + }, + { + "epoch": 0.09016, + "grad_norm": 0.7296383845886047, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 9016 + }, + { + "epoch": 0.09017, + "grad_norm": 0.6317455085918721, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 9017 + }, + { + "epoch": 0.09018, + "grad_norm": 0.6094236570193985, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 9018 + }, + { + "epoch": 0.09019, + "grad_norm": 0.6782840216217128, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 9019 + }, + { + "epoch": 0.0902, + "grad_norm": 0.7866077854724068, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 9020 + }, + { + "epoch": 0.09021, + "grad_norm": 0.9385063075453379, + "learning_rate": 0.003, + "loss": 4.1068, + "step": 9021 + }, + { + "epoch": 0.09022, + "grad_norm": 1.0620409463514187, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 9022 + }, + { + "epoch": 0.09023, + "grad_norm": 0.7658741306051069, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 9023 + }, + { + "epoch": 0.09024, + "grad_norm": 0.6814552110746341, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 9024 + }, + { + "epoch": 0.09025, + "grad_norm": 0.7730019678384572, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 9025 + }, + { + "epoch": 0.09026, + "grad_norm": 0.7526237882680384, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 9026 + }, + { + "epoch": 0.09027, + "grad_norm": 0.6507122800031654, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 9027 + }, + { + "epoch": 0.09028, + "grad_norm": 0.6004305708920464, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 9028 + }, + { + "epoch": 0.09029, + "grad_norm": 0.524677033109727, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 9029 + }, + { + "epoch": 0.0903, + "grad_norm": 0.6033650045330714, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 9030 + }, + { + "epoch": 0.09031, + "grad_norm": 0.5871602565860976, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 9031 + }, + { + "epoch": 0.09032, + "grad_norm": 0.6156339772467885, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 9032 + }, + { + "epoch": 0.09033, + "grad_norm": 0.5735688885353999, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 9033 + }, + { + "epoch": 0.09034, + "grad_norm": 0.5650782637932048, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 9034 + }, + { + "epoch": 0.09035, + "grad_norm": 0.6052372993053132, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 9035 + }, + { + "epoch": 0.09036, + "grad_norm": 0.5476010632727801, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 9036 + }, + { + "epoch": 0.09037, + "grad_norm": 0.583873535910786, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 9037 + }, + { + "epoch": 0.09038, + "grad_norm": 0.7287923974736967, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 9038 + }, + { + "epoch": 0.09039, + "grad_norm": 0.9144241018735565, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 9039 + }, + { + "epoch": 0.0904, + "grad_norm": 1.0063720204208155, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 9040 + }, + { + "epoch": 0.09041, + "grad_norm": 0.9856961456597553, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 9041 + }, + { + "epoch": 0.09042, + "grad_norm": 0.854851014929688, + "learning_rate": 0.003, + "loss": 4.079, + "step": 9042 + }, + { + "epoch": 0.09043, + "grad_norm": 0.888431844356114, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 9043 + }, + { + "epoch": 0.09044, + "grad_norm": 1.017236877807041, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 9044 + }, + { + "epoch": 0.09045, + "grad_norm": 1.031807206439807, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 9045 + }, + { + "epoch": 0.09046, + "grad_norm": 0.85682286882739, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 9046 + }, + { + "epoch": 0.09047, + "grad_norm": 0.879826141603597, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 9047 + }, + { + "epoch": 0.09048, + "grad_norm": 0.8831965636813381, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 9048 + }, + { + "epoch": 0.09049, + "grad_norm": 0.9392095074031075, + "learning_rate": 0.003, + "loss": 4.1268, + "step": 9049 + }, + { + "epoch": 0.0905, + "grad_norm": 0.7828170543785254, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 9050 + }, + { + "epoch": 0.09051, + "grad_norm": 0.8339468015119822, + "learning_rate": 0.003, + "loss": 4.094, + "step": 9051 + }, + { + "epoch": 0.09052, + "grad_norm": 1.0336225007587214, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 9052 + }, + { + "epoch": 0.09053, + "grad_norm": 1.1758495676250011, + "learning_rate": 0.003, + "loss": 4.1068, + "step": 9053 + }, + { + "epoch": 0.09054, + "grad_norm": 0.6902237254747, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 9054 + }, + { + "epoch": 0.09055, + "grad_norm": 0.6385588548368929, + "learning_rate": 0.003, + "loss": 4.08, + "step": 9055 + }, + { + "epoch": 0.09056, + "grad_norm": 0.5682201761251565, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 9056 + }, + { + "epoch": 0.09057, + "grad_norm": 0.6058063245223638, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 9057 + }, + { + "epoch": 0.09058, + "grad_norm": 0.6296895298242686, + "learning_rate": 0.003, + "loss": 4.116, + "step": 9058 + }, + { + "epoch": 0.09059, + "grad_norm": 0.6548379189063352, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 9059 + }, + { + "epoch": 0.0906, + "grad_norm": 0.556996825598181, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 9060 + }, + { + "epoch": 0.09061, + "grad_norm": 0.521158274387155, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 9061 + }, + { + "epoch": 0.09062, + "grad_norm": 0.49652995554371476, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 9062 + }, + { + "epoch": 0.09063, + "grad_norm": 0.4672371999627178, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 9063 + }, + { + "epoch": 0.09064, + "grad_norm": 0.39411015877585076, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 9064 + }, + { + "epoch": 0.09065, + "grad_norm": 0.4138073282094934, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 9065 + }, + { + "epoch": 0.09066, + "grad_norm": 0.4761901801806996, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 9066 + }, + { + "epoch": 0.09067, + "grad_norm": 0.529331487923414, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 9067 + }, + { + "epoch": 0.09068, + "grad_norm": 0.5533215938891374, + "learning_rate": 0.003, + "loss": 4.088, + "step": 9068 + }, + { + "epoch": 0.09069, + "grad_norm": 0.6302304828764494, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 9069 + }, + { + "epoch": 0.0907, + "grad_norm": 0.7733370160113519, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 9070 + }, + { + "epoch": 0.09071, + "grad_norm": 0.9201950339828795, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 9071 + }, + { + "epoch": 0.09072, + "grad_norm": 1.0000279216059271, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 9072 + }, + { + "epoch": 0.09073, + "grad_norm": 0.9703960584155082, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 9073 + }, + { + "epoch": 0.09074, + "grad_norm": 0.7378599237261866, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 9074 + }, + { + "epoch": 0.09075, + "grad_norm": 0.6334621773210859, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 9075 + }, + { + "epoch": 0.09076, + "grad_norm": 0.7233595328453737, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 9076 + }, + { + "epoch": 0.09077, + "grad_norm": 0.6798870748945902, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 9077 + }, + { + "epoch": 0.09078, + "grad_norm": 0.6666031376042267, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 9078 + }, + { + "epoch": 0.09079, + "grad_norm": 0.8196363695329622, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 9079 + }, + { + "epoch": 0.0908, + "grad_norm": 0.8369563991262994, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 9080 + }, + { + "epoch": 0.09081, + "grad_norm": 0.7879366661568333, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 9081 + }, + { + "epoch": 0.09082, + "grad_norm": 0.9098587816411382, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 9082 + }, + { + "epoch": 0.09083, + "grad_norm": 0.9134982635005068, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 9083 + }, + { + "epoch": 0.09084, + "grad_norm": 0.9505568276434497, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 9084 + }, + { + "epoch": 0.09085, + "grad_norm": 0.8638717507088994, + "learning_rate": 0.003, + "loss": 4.116, + "step": 9085 + }, + { + "epoch": 0.09086, + "grad_norm": 0.8895115781730211, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 9086 + }, + { + "epoch": 0.09087, + "grad_norm": 1.060170360539398, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 9087 + }, + { + "epoch": 0.09088, + "grad_norm": 0.901927345203865, + "learning_rate": 0.003, + "loss": 4.091, + "step": 9088 + }, + { + "epoch": 0.09089, + "grad_norm": 0.785289100063639, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 9089 + }, + { + "epoch": 0.0909, + "grad_norm": 0.8620215806982862, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 9090 + }, + { + "epoch": 0.09091, + "grad_norm": 0.7979739519204804, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 9091 + }, + { + "epoch": 0.09092, + "grad_norm": 0.8103723620422554, + "learning_rate": 0.003, + "loss": 4.1311, + "step": 9092 + }, + { + "epoch": 0.09093, + "grad_norm": 0.7761450446921329, + "learning_rate": 0.003, + "loss": 4.089, + "step": 9093 + }, + { + "epoch": 0.09094, + "grad_norm": 0.7568316928415706, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 9094 + }, + { + "epoch": 0.09095, + "grad_norm": 0.759347481387229, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 9095 + }, + { + "epoch": 0.09096, + "grad_norm": 0.766744250041468, + "learning_rate": 0.003, + "loss": 4.087, + "step": 9096 + }, + { + "epoch": 0.09097, + "grad_norm": 0.7055484847046865, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 9097 + }, + { + "epoch": 0.09098, + "grad_norm": 0.7249870657972629, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 9098 + }, + { + "epoch": 0.09099, + "grad_norm": 0.794584702637224, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 9099 + }, + { + "epoch": 0.091, + "grad_norm": 0.9292342497012281, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 9100 + }, + { + "epoch": 0.09101, + "grad_norm": 1.1151713569446484, + "learning_rate": 0.003, + "loss": 4.107, + "step": 9101 + }, + { + "epoch": 0.09102, + "grad_norm": 0.9856776803625353, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 9102 + }, + { + "epoch": 0.09103, + "grad_norm": 0.8883418202472645, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 9103 + }, + { + "epoch": 0.09104, + "grad_norm": 0.7209861355720855, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 9104 + }, + { + "epoch": 0.09105, + "grad_norm": 0.6974273614545563, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 9105 + }, + { + "epoch": 0.09106, + "grad_norm": 0.7573369118352933, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 9106 + }, + { + "epoch": 0.09107, + "grad_norm": 0.8573483561968565, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 9107 + }, + { + "epoch": 0.09108, + "grad_norm": 0.843416858845127, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 9108 + }, + { + "epoch": 0.09109, + "grad_norm": 0.7331572974610379, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 9109 + }, + { + "epoch": 0.0911, + "grad_norm": 0.6866482073169092, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 9110 + }, + { + "epoch": 0.09111, + "grad_norm": 0.6146584810372685, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 9111 + }, + { + "epoch": 0.09112, + "grad_norm": 0.6233531858518306, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 9112 + }, + { + "epoch": 0.09113, + "grad_norm": 0.6630440885467346, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 9113 + }, + { + "epoch": 0.09114, + "grad_norm": 0.6675611226061425, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 9114 + }, + { + "epoch": 0.09115, + "grad_norm": 0.6851264099217901, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 9115 + }, + { + "epoch": 0.09116, + "grad_norm": 0.6743396804700748, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 9116 + }, + { + "epoch": 0.09117, + "grad_norm": 0.6349134476822885, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 9117 + }, + { + "epoch": 0.09118, + "grad_norm": 0.5734871976924563, + "learning_rate": 0.003, + "loss": 4.087, + "step": 9118 + }, + { + "epoch": 0.09119, + "grad_norm": 0.5722066825396821, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 9119 + }, + { + "epoch": 0.0912, + "grad_norm": 0.5747337761394117, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 9120 + }, + { + "epoch": 0.09121, + "grad_norm": 0.5703997923017384, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 9121 + }, + { + "epoch": 0.09122, + "grad_norm": 0.5937813885823265, + "learning_rate": 0.003, + "loss": 4.104, + "step": 9122 + }, + { + "epoch": 0.09123, + "grad_norm": 0.5793339846399808, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 9123 + }, + { + "epoch": 0.09124, + "grad_norm": 0.7079624241350131, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 9124 + }, + { + "epoch": 0.09125, + "grad_norm": 0.9363241418641047, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 9125 + }, + { + "epoch": 0.09126, + "grad_norm": 1.123535275176799, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 9126 + }, + { + "epoch": 0.09127, + "grad_norm": 0.9098450109303394, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 9127 + }, + { + "epoch": 0.09128, + "grad_norm": 0.7695802519035674, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 9128 + }, + { + "epoch": 0.09129, + "grad_norm": 0.6725773758279523, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 9129 + }, + { + "epoch": 0.0913, + "grad_norm": 0.7516703750695625, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 9130 + }, + { + "epoch": 0.09131, + "grad_norm": 0.7232356676372639, + "learning_rate": 0.003, + "loss": 4.078, + "step": 9131 + }, + { + "epoch": 0.09132, + "grad_norm": 0.6859783867977578, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 9132 + }, + { + "epoch": 0.09133, + "grad_norm": 0.6441924486606353, + "learning_rate": 0.003, + "loss": 4.084, + "step": 9133 + }, + { + "epoch": 0.09134, + "grad_norm": 0.655918695866971, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 9134 + }, + { + "epoch": 0.09135, + "grad_norm": 0.7860018696069528, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 9135 + }, + { + "epoch": 0.09136, + "grad_norm": 0.901439083024281, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 9136 + }, + { + "epoch": 0.09137, + "grad_norm": 1.0327495887186917, + "learning_rate": 0.003, + "loss": 4.128, + "step": 9137 + }, + { + "epoch": 0.09138, + "grad_norm": 0.9559641464369752, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 9138 + }, + { + "epoch": 0.09139, + "grad_norm": 0.8021385645026318, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 9139 + }, + { + "epoch": 0.0914, + "grad_norm": 0.6983879716817356, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 9140 + }, + { + "epoch": 0.09141, + "grad_norm": 0.6229466794575317, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 9141 + }, + { + "epoch": 0.09142, + "grad_norm": 0.5394263568511116, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 9142 + }, + { + "epoch": 0.09143, + "grad_norm": 0.5799460322868719, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 9143 + }, + { + "epoch": 0.09144, + "grad_norm": 0.7115794520642408, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 9144 + }, + { + "epoch": 0.09145, + "grad_norm": 0.9250322234700112, + "learning_rate": 0.003, + "loss": 4.1122, + "step": 9145 + }, + { + "epoch": 0.09146, + "grad_norm": 1.1493765993374974, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 9146 + }, + { + "epoch": 0.09147, + "grad_norm": 0.6766953092233843, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 9147 + }, + { + "epoch": 0.09148, + "grad_norm": 0.615481970405836, + "learning_rate": 0.003, + "loss": 4.102, + "step": 9148 + }, + { + "epoch": 0.09149, + "grad_norm": 0.6468342965420243, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 9149 + }, + { + "epoch": 0.0915, + "grad_norm": 0.5695495986990226, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 9150 + }, + { + "epoch": 0.09151, + "grad_norm": 0.6322625762493587, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 9151 + }, + { + "epoch": 0.09152, + "grad_norm": 0.7095634250401223, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 9152 + }, + { + "epoch": 0.09153, + "grad_norm": 0.7562377869290767, + "learning_rate": 0.003, + "loss": 4.092, + "step": 9153 + }, + { + "epoch": 0.09154, + "grad_norm": 0.730100776007017, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 9154 + }, + { + "epoch": 0.09155, + "grad_norm": 0.6476046997499159, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 9155 + }, + { + "epoch": 0.09156, + "grad_norm": 0.5966611216224073, + "learning_rate": 0.003, + "loss": 4.092, + "step": 9156 + }, + { + "epoch": 0.09157, + "grad_norm": 0.729366774751956, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 9157 + }, + { + "epoch": 0.09158, + "grad_norm": 0.9225886030022502, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 9158 + }, + { + "epoch": 0.09159, + "grad_norm": 1.0003949696131078, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 9159 + }, + { + "epoch": 0.0916, + "grad_norm": 0.8838504831163226, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 9160 + }, + { + "epoch": 0.09161, + "grad_norm": 1.0435641927134112, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 9161 + }, + { + "epoch": 0.09162, + "grad_norm": 0.9849828677019289, + "learning_rate": 0.003, + "loss": 4.131, + "step": 9162 + }, + { + "epoch": 0.09163, + "grad_norm": 0.8423932570011732, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 9163 + }, + { + "epoch": 0.09164, + "grad_norm": 0.8118112808914887, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 9164 + }, + { + "epoch": 0.09165, + "grad_norm": 0.696647926319978, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 9165 + }, + { + "epoch": 0.09166, + "grad_norm": 0.7050748772822608, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 9166 + }, + { + "epoch": 0.09167, + "grad_norm": 0.6984810250947892, + "learning_rate": 0.003, + "loss": 4.082, + "step": 9167 + }, + { + "epoch": 0.09168, + "grad_norm": 0.7429002727610159, + "learning_rate": 0.003, + "loss": 4.075, + "step": 9168 + }, + { + "epoch": 0.09169, + "grad_norm": 0.8700828623089742, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 9169 + }, + { + "epoch": 0.0917, + "grad_norm": 0.8255131562046868, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 9170 + }, + { + "epoch": 0.09171, + "grad_norm": 0.8257708158169661, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 9171 + }, + { + "epoch": 0.09172, + "grad_norm": 0.9821171250112688, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 9172 + }, + { + "epoch": 0.09173, + "grad_norm": 1.1101867989693224, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 9173 + }, + { + "epoch": 0.09174, + "grad_norm": 0.8606212362625643, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 9174 + }, + { + "epoch": 0.09175, + "grad_norm": 0.7238484783389175, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 9175 + }, + { + "epoch": 0.09176, + "grad_norm": 0.7778471274442456, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 9176 + }, + { + "epoch": 0.09177, + "grad_norm": 0.7942453477102529, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 9177 + }, + { + "epoch": 0.09178, + "grad_norm": 0.6875439916755981, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 9178 + }, + { + "epoch": 0.09179, + "grad_norm": 0.7054405486618374, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 9179 + }, + { + "epoch": 0.0918, + "grad_norm": 0.7085929286496793, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 9180 + }, + { + "epoch": 0.09181, + "grad_norm": 0.6868386510039214, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 9181 + }, + { + "epoch": 0.09182, + "grad_norm": 0.693686693395854, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 9182 + }, + { + "epoch": 0.09183, + "grad_norm": 0.6969673134627956, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 9183 + }, + { + "epoch": 0.09184, + "grad_norm": 0.6607879678388875, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 9184 + }, + { + "epoch": 0.09185, + "grad_norm": 0.6097277133504103, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 9185 + }, + { + "epoch": 0.09186, + "grad_norm": 0.6707275075667879, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 9186 + }, + { + "epoch": 0.09187, + "grad_norm": 0.631051696187288, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 9187 + }, + { + "epoch": 0.09188, + "grad_norm": 0.5850735017399692, + "learning_rate": 0.003, + "loss": 4.091, + "step": 9188 + }, + { + "epoch": 0.09189, + "grad_norm": 0.5909062131280334, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 9189 + }, + { + "epoch": 0.0919, + "grad_norm": 0.5804461230556179, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 9190 + }, + { + "epoch": 0.09191, + "grad_norm": 0.74233665537322, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 9191 + }, + { + "epoch": 0.09192, + "grad_norm": 1.0070740958090563, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 9192 + }, + { + "epoch": 0.09193, + "grad_norm": 1.3162433610093374, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 9193 + }, + { + "epoch": 0.09194, + "grad_norm": 0.4948613105868042, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 9194 + }, + { + "epoch": 0.09195, + "grad_norm": 0.683388327325886, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 9195 + }, + { + "epoch": 0.09196, + "grad_norm": 0.8065888189569945, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 9196 + }, + { + "epoch": 0.09197, + "grad_norm": 0.7584747106256147, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 9197 + }, + { + "epoch": 0.09198, + "grad_norm": 0.7753320297151186, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 9198 + }, + { + "epoch": 0.09199, + "grad_norm": 0.7824527448547508, + "learning_rate": 0.003, + "loss": 4.104, + "step": 9199 + }, + { + "epoch": 0.092, + "grad_norm": 0.7775469360725809, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 9200 + }, + { + "epoch": 0.09201, + "grad_norm": 0.7680517906058487, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 9201 + }, + { + "epoch": 0.09202, + "grad_norm": 0.8531435904229961, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 9202 + }, + { + "epoch": 0.09203, + "grad_norm": 0.8780606886961511, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 9203 + }, + { + "epoch": 0.09204, + "grad_norm": 0.7891522389637466, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 9204 + }, + { + "epoch": 0.09205, + "grad_norm": 0.7455931059386063, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 9205 + }, + { + "epoch": 0.09206, + "grad_norm": 0.7535666910826962, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 9206 + }, + { + "epoch": 0.09207, + "grad_norm": 0.6917320820801799, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 9207 + }, + { + "epoch": 0.09208, + "grad_norm": 0.704126699875504, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 9208 + }, + { + "epoch": 0.09209, + "grad_norm": 0.8523565990050219, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 9209 + }, + { + "epoch": 0.0921, + "grad_norm": 0.9685150489510521, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 9210 + }, + { + "epoch": 0.09211, + "grad_norm": 1.1138999636053162, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 9211 + }, + { + "epoch": 0.09212, + "grad_norm": 0.7870842232400849, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 9212 + }, + { + "epoch": 0.09213, + "grad_norm": 0.7697833258515777, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 9213 + }, + { + "epoch": 0.09214, + "grad_norm": 0.6846278043413059, + "learning_rate": 0.003, + "loss": 4.074, + "step": 9214 + }, + { + "epoch": 0.09215, + "grad_norm": 0.6743239347064265, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 9215 + }, + { + "epoch": 0.09216, + "grad_norm": 0.7010992201639914, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 9216 + }, + { + "epoch": 0.09217, + "grad_norm": 0.7476228942446806, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 9217 + }, + { + "epoch": 0.09218, + "grad_norm": 0.8348321650225655, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 9218 + }, + { + "epoch": 0.09219, + "grad_norm": 0.9157211023791693, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 9219 + }, + { + "epoch": 0.0922, + "grad_norm": 0.83049456838394, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 9220 + }, + { + "epoch": 0.09221, + "grad_norm": 0.6939065160104407, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 9221 + }, + { + "epoch": 0.09222, + "grad_norm": 0.692227128387877, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 9222 + }, + { + "epoch": 0.09223, + "grad_norm": 0.6206638261808557, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 9223 + }, + { + "epoch": 0.09224, + "grad_norm": 0.6155175720948994, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 9224 + }, + { + "epoch": 0.09225, + "grad_norm": 0.5763438884147025, + "learning_rate": 0.003, + "loss": 4.075, + "step": 9225 + }, + { + "epoch": 0.09226, + "grad_norm": 0.5792522714322638, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 9226 + }, + { + "epoch": 0.09227, + "grad_norm": 0.5698582837332627, + "learning_rate": 0.003, + "loss": 4.078, + "step": 9227 + }, + { + "epoch": 0.09228, + "grad_norm": 0.5554181806254156, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 9228 + }, + { + "epoch": 0.09229, + "grad_norm": 0.6496771344535306, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 9229 + }, + { + "epoch": 0.0923, + "grad_norm": 0.8079631630379661, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 9230 + }, + { + "epoch": 0.09231, + "grad_norm": 1.001327955726392, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 9231 + }, + { + "epoch": 0.09232, + "grad_norm": 1.0151152557949434, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 9232 + }, + { + "epoch": 0.09233, + "grad_norm": 0.7597476613283288, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 9233 + }, + { + "epoch": 0.09234, + "grad_norm": 0.7793623041860894, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 9234 + }, + { + "epoch": 0.09235, + "grad_norm": 0.8181572642649179, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 9235 + }, + { + "epoch": 0.09236, + "grad_norm": 0.7748209029826961, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 9236 + }, + { + "epoch": 0.09237, + "grad_norm": 0.7417587962678789, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 9237 + }, + { + "epoch": 0.09238, + "grad_norm": 0.7302829503235564, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 9238 + }, + { + "epoch": 0.09239, + "grad_norm": 0.8200299371821417, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 9239 + }, + { + "epoch": 0.0924, + "grad_norm": 0.7745701228788062, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 9240 + }, + { + "epoch": 0.09241, + "grad_norm": 0.8722251162417323, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 9241 + }, + { + "epoch": 0.09242, + "grad_norm": 0.890375969288489, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 9242 + }, + { + "epoch": 0.09243, + "grad_norm": 0.8425579306067554, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 9243 + }, + { + "epoch": 0.09244, + "grad_norm": 0.750374270081524, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 9244 + }, + { + "epoch": 0.09245, + "grad_norm": 0.7433474953602284, + "learning_rate": 0.003, + "loss": 4.079, + "step": 9245 + }, + { + "epoch": 0.09246, + "grad_norm": 0.8399736354443268, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 9246 + }, + { + "epoch": 0.09247, + "grad_norm": 0.8952276071336189, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 9247 + }, + { + "epoch": 0.09248, + "grad_norm": 0.9460058010349833, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 9248 + }, + { + "epoch": 0.09249, + "grad_norm": 1.028348281495935, + "learning_rate": 0.003, + "loss": 4.1543, + "step": 9249 + }, + { + "epoch": 0.0925, + "grad_norm": 0.8994505559170306, + "learning_rate": 0.003, + "loss": 4.121, + "step": 9250 + }, + { + "epoch": 0.09251, + "grad_norm": 0.8661162024839089, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 9251 + }, + { + "epoch": 0.09252, + "grad_norm": 0.7576653757537349, + "learning_rate": 0.003, + "loss": 4.1253, + "step": 9252 + }, + { + "epoch": 0.09253, + "grad_norm": 0.7300578749253667, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 9253 + }, + { + "epoch": 0.09254, + "grad_norm": 0.6655033765923661, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 9254 + }, + { + "epoch": 0.09255, + "grad_norm": 0.67210004385522, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 9255 + }, + { + "epoch": 0.09256, + "grad_norm": 0.6009374878800777, + "learning_rate": 0.003, + "loss": 4.098, + "step": 9256 + }, + { + "epoch": 0.09257, + "grad_norm": 0.5939860882674391, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 9257 + }, + { + "epoch": 0.09258, + "grad_norm": 0.6273131160536646, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 9258 + }, + { + "epoch": 0.09259, + "grad_norm": 0.6931906821491065, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 9259 + }, + { + "epoch": 0.0926, + "grad_norm": 0.6843836039174134, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 9260 + }, + { + "epoch": 0.09261, + "grad_norm": 0.664844852067105, + "learning_rate": 0.003, + "loss": 4.071, + "step": 9261 + }, + { + "epoch": 0.09262, + "grad_norm": 0.7149590284934766, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 9262 + }, + { + "epoch": 0.09263, + "grad_norm": 0.799686238491033, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 9263 + }, + { + "epoch": 0.09264, + "grad_norm": 0.9184087623202745, + "learning_rate": 0.003, + "loss": 4.1361, + "step": 9264 + }, + { + "epoch": 0.09265, + "grad_norm": 0.8917999340633649, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 9265 + }, + { + "epoch": 0.09266, + "grad_norm": 1.0690273464224147, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 9266 + }, + { + "epoch": 0.09267, + "grad_norm": 0.9224213714197611, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 9267 + }, + { + "epoch": 0.09268, + "grad_norm": 0.7819872181594957, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 9268 + }, + { + "epoch": 0.09269, + "grad_norm": 0.8903337895871032, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 9269 + }, + { + "epoch": 0.0927, + "grad_norm": 0.8789410698039444, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 9270 + }, + { + "epoch": 0.09271, + "grad_norm": 0.8151226774452855, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 9271 + }, + { + "epoch": 0.09272, + "grad_norm": 0.6687056930008045, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 9272 + }, + { + "epoch": 0.09273, + "grad_norm": 0.6975950766902023, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 9273 + }, + { + "epoch": 0.09274, + "grad_norm": 0.704466186613381, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 9274 + }, + { + "epoch": 0.09275, + "grad_norm": 0.7505976895896498, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 9275 + }, + { + "epoch": 0.09276, + "grad_norm": 0.7764606026959862, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 9276 + }, + { + "epoch": 0.09277, + "grad_norm": 0.635369630015743, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 9277 + }, + { + "epoch": 0.09278, + "grad_norm": 0.6241312910359976, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 9278 + }, + { + "epoch": 0.09279, + "grad_norm": 0.6872238724869792, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 9279 + }, + { + "epoch": 0.0928, + "grad_norm": 0.7388825536520087, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 9280 + }, + { + "epoch": 0.09281, + "grad_norm": 0.7389024774211659, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 9281 + }, + { + "epoch": 0.09282, + "grad_norm": 0.7721013112557448, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 9282 + }, + { + "epoch": 0.09283, + "grad_norm": 0.8336050701348083, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 9283 + }, + { + "epoch": 0.09284, + "grad_norm": 1.0158256883508041, + "learning_rate": 0.003, + "loss": 4.1008, + "step": 9284 + }, + { + "epoch": 0.09285, + "grad_norm": 1.18058053789904, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 9285 + }, + { + "epoch": 0.09286, + "grad_norm": 0.8343774515252792, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 9286 + }, + { + "epoch": 0.09287, + "grad_norm": 0.7308200337417591, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 9287 + }, + { + "epoch": 0.09288, + "grad_norm": 0.6891633376361983, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 9288 + }, + { + "epoch": 0.09289, + "grad_norm": 0.6029232592146974, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 9289 + }, + { + "epoch": 0.0929, + "grad_norm": 0.5626416862126429, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 9290 + }, + { + "epoch": 0.09291, + "grad_norm": 0.5779583984794753, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 9291 + }, + { + "epoch": 0.09292, + "grad_norm": 0.5276189332666765, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 9292 + }, + { + "epoch": 0.09293, + "grad_norm": 0.4601373246181229, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 9293 + }, + { + "epoch": 0.09294, + "grad_norm": 0.5858557148598581, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 9294 + }, + { + "epoch": 0.09295, + "grad_norm": 0.6570320858383965, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 9295 + }, + { + "epoch": 0.09296, + "grad_norm": 0.7058986690585867, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 9296 + }, + { + "epoch": 0.09297, + "grad_norm": 0.7237851928274343, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 9297 + }, + { + "epoch": 0.09298, + "grad_norm": 0.7244115678072608, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 9298 + }, + { + "epoch": 0.09299, + "grad_norm": 0.7023860071795822, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 9299 + }, + { + "epoch": 0.093, + "grad_norm": 0.6893885792882112, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 9300 + }, + { + "epoch": 0.09301, + "grad_norm": 0.7771637507723111, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 9301 + }, + { + "epoch": 0.09302, + "grad_norm": 0.8353111718797918, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 9302 + }, + { + "epoch": 0.09303, + "grad_norm": 0.9443868135718669, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 9303 + }, + { + "epoch": 0.09304, + "grad_norm": 0.9340284309555337, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 9304 + }, + { + "epoch": 0.09305, + "grad_norm": 0.821105987408757, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 9305 + }, + { + "epoch": 0.09306, + "grad_norm": 0.6798841287919474, + "learning_rate": 0.003, + "loss": 4.088, + "step": 9306 + }, + { + "epoch": 0.09307, + "grad_norm": 0.6308321339032982, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 9307 + }, + { + "epoch": 0.09308, + "grad_norm": 0.6601870406757323, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 9308 + }, + { + "epoch": 0.09309, + "grad_norm": 0.6442557609343686, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 9309 + }, + { + "epoch": 0.0931, + "grad_norm": 0.6661782172293007, + "learning_rate": 0.003, + "loss": 4.1, + "step": 9310 + }, + { + "epoch": 0.09311, + "grad_norm": 0.694502121054022, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 9311 + }, + { + "epoch": 0.09312, + "grad_norm": 0.7404429863152544, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 9312 + }, + { + "epoch": 0.09313, + "grad_norm": 0.7933542321293575, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 9313 + }, + { + "epoch": 0.09314, + "grad_norm": 0.801634188665558, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 9314 + }, + { + "epoch": 0.09315, + "grad_norm": 0.8599424000518597, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 9315 + }, + { + "epoch": 0.09316, + "grad_norm": 0.8488962903419432, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 9316 + }, + { + "epoch": 0.09317, + "grad_norm": 0.8383529764637065, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 9317 + }, + { + "epoch": 0.09318, + "grad_norm": 0.8558462806350178, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 9318 + }, + { + "epoch": 0.09319, + "grad_norm": 0.9413535849970353, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 9319 + }, + { + "epoch": 0.0932, + "grad_norm": 1.2104907947773857, + "learning_rate": 0.003, + "loss": 4.1146, + "step": 9320 + }, + { + "epoch": 0.09321, + "grad_norm": 0.9438406966956778, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 9321 + }, + { + "epoch": 0.09322, + "grad_norm": 0.8654878853235997, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 9322 + }, + { + "epoch": 0.09323, + "grad_norm": 0.7716832268633257, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 9323 + }, + { + "epoch": 0.09324, + "grad_norm": 0.9620871505328507, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 9324 + }, + { + "epoch": 0.09325, + "grad_norm": 1.033201062271869, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 9325 + }, + { + "epoch": 0.09326, + "grad_norm": 0.9180758393766172, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 9326 + }, + { + "epoch": 0.09327, + "grad_norm": 0.9604068125223971, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 9327 + }, + { + "epoch": 0.09328, + "grad_norm": 0.8487522338913149, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 9328 + }, + { + "epoch": 0.09329, + "grad_norm": 0.7140038231334875, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 9329 + }, + { + "epoch": 0.0933, + "grad_norm": 0.6871213262271845, + "learning_rate": 0.003, + "loss": 4.1235, + "step": 9330 + }, + { + "epoch": 0.09331, + "grad_norm": 0.7546334768507528, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 9331 + }, + { + "epoch": 0.09332, + "grad_norm": 0.8370311645065874, + "learning_rate": 0.003, + "loss": 4.1162, + "step": 9332 + }, + { + "epoch": 0.09333, + "grad_norm": 1.1267395055615177, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 9333 + }, + { + "epoch": 0.09334, + "grad_norm": 0.8851656694755474, + "learning_rate": 0.003, + "loss": 4.119, + "step": 9334 + }, + { + "epoch": 0.09335, + "grad_norm": 0.6895247005069166, + "learning_rate": 0.003, + "loss": 4.097, + "step": 9335 + }, + { + "epoch": 0.09336, + "grad_norm": 0.7529887012918819, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 9336 + }, + { + "epoch": 0.09337, + "grad_norm": 0.8968776641201162, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 9337 + }, + { + "epoch": 0.09338, + "grad_norm": 0.969190767069404, + "learning_rate": 0.003, + "loss": 4.1145, + "step": 9338 + }, + { + "epoch": 0.09339, + "grad_norm": 0.9096354758398183, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 9339 + }, + { + "epoch": 0.0934, + "grad_norm": 1.0248240766167955, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 9340 + }, + { + "epoch": 0.09341, + "grad_norm": 0.8411462432609955, + "learning_rate": 0.003, + "loss": 4.107, + "step": 9341 + }, + { + "epoch": 0.09342, + "grad_norm": 0.7516218020852315, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 9342 + }, + { + "epoch": 0.09343, + "grad_norm": 0.8168975424156755, + "learning_rate": 0.003, + "loss": 4.101, + "step": 9343 + }, + { + "epoch": 0.09344, + "grad_norm": 0.8281071489261201, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 9344 + }, + { + "epoch": 0.09345, + "grad_norm": 0.7971176476434507, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 9345 + }, + { + "epoch": 0.09346, + "grad_norm": 0.7256703424187483, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 9346 + }, + { + "epoch": 0.09347, + "grad_norm": 0.625000462139122, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 9347 + }, + { + "epoch": 0.09348, + "grad_norm": 0.5650513642145435, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 9348 + }, + { + "epoch": 0.09349, + "grad_norm": 0.5637070822633384, + "learning_rate": 0.003, + "loss": 4.1034, + "step": 9349 + }, + { + "epoch": 0.0935, + "grad_norm": 0.5477607998488464, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 9350 + }, + { + "epoch": 0.09351, + "grad_norm": 0.5967568854844185, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 9351 + }, + { + "epoch": 0.09352, + "grad_norm": 0.6324408848296686, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 9352 + }, + { + "epoch": 0.09353, + "grad_norm": 0.5988050971610268, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 9353 + }, + { + "epoch": 0.09354, + "grad_norm": 0.6587621555860957, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 9354 + }, + { + "epoch": 0.09355, + "grad_norm": 0.6913770140531089, + "learning_rate": 0.003, + "loss": 4.104, + "step": 9355 + }, + { + "epoch": 0.09356, + "grad_norm": 0.8268308580950648, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 9356 + }, + { + "epoch": 0.09357, + "grad_norm": 0.9832141390606829, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 9357 + }, + { + "epoch": 0.09358, + "grad_norm": 1.0264443343215965, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 9358 + }, + { + "epoch": 0.09359, + "grad_norm": 0.8022400111008331, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 9359 + }, + { + "epoch": 0.0936, + "grad_norm": 0.7232160641932127, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 9360 + }, + { + "epoch": 0.09361, + "grad_norm": 0.8345708225298007, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 9361 + }, + { + "epoch": 0.09362, + "grad_norm": 0.864583150263912, + "learning_rate": 0.003, + "loss": 4.1319, + "step": 9362 + }, + { + "epoch": 0.09363, + "grad_norm": 0.8449596309061119, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 9363 + }, + { + "epoch": 0.09364, + "grad_norm": 0.721880849977252, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 9364 + }, + { + "epoch": 0.09365, + "grad_norm": 0.6721169386273598, + "learning_rate": 0.003, + "loss": 4.11, + "step": 9365 + }, + { + "epoch": 0.09366, + "grad_norm": 0.5176745603655055, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 9366 + }, + { + "epoch": 0.09367, + "grad_norm": 0.5663365559445198, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 9367 + }, + { + "epoch": 0.09368, + "grad_norm": 0.5588855629457294, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 9368 + }, + { + "epoch": 0.09369, + "grad_norm": 0.558318290254828, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 9369 + }, + { + "epoch": 0.0937, + "grad_norm": 0.5777087869339129, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 9370 + }, + { + "epoch": 0.09371, + "grad_norm": 0.6683550838053252, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 9371 + }, + { + "epoch": 0.09372, + "grad_norm": 0.8106915641791952, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 9372 + }, + { + "epoch": 0.09373, + "grad_norm": 1.0420652181706669, + "learning_rate": 0.003, + "loss": 4.1327, + "step": 9373 + }, + { + "epoch": 0.09374, + "grad_norm": 0.9030870891200452, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 9374 + }, + { + "epoch": 0.09375, + "grad_norm": 0.6642846536487922, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 9375 + }, + { + "epoch": 0.09376, + "grad_norm": 0.5926013599625012, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 9376 + }, + { + "epoch": 0.09377, + "grad_norm": 0.7333731763197545, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 9377 + }, + { + "epoch": 0.09378, + "grad_norm": 0.7205911448453179, + "learning_rate": 0.003, + "loss": 4.098, + "step": 9378 + }, + { + "epoch": 0.09379, + "grad_norm": 0.6261002234170417, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 9379 + }, + { + "epoch": 0.0938, + "grad_norm": 0.5831220315530415, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 9380 + }, + { + "epoch": 0.09381, + "grad_norm": 0.6453696033833166, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 9381 + }, + { + "epoch": 0.09382, + "grad_norm": 0.7447834448687087, + "learning_rate": 0.003, + "loss": 4.09, + "step": 9382 + }, + { + "epoch": 0.09383, + "grad_norm": 0.8017088185500696, + "learning_rate": 0.003, + "loss": 4.077, + "step": 9383 + }, + { + "epoch": 0.09384, + "grad_norm": 0.749499774993177, + "learning_rate": 0.003, + "loss": 4.106, + "step": 9384 + }, + { + "epoch": 0.09385, + "grad_norm": 0.7694390492566549, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 9385 + }, + { + "epoch": 0.09386, + "grad_norm": 0.8555158072501666, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 9386 + }, + { + "epoch": 0.09387, + "grad_norm": 0.8776130218746526, + "learning_rate": 0.003, + "loss": 4.101, + "step": 9387 + }, + { + "epoch": 0.09388, + "grad_norm": 0.9970607046729825, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 9388 + }, + { + "epoch": 0.09389, + "grad_norm": 1.0116948164111348, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 9389 + }, + { + "epoch": 0.0939, + "grad_norm": 0.8607572473294256, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 9390 + }, + { + "epoch": 0.09391, + "grad_norm": 0.8141023279481875, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 9391 + }, + { + "epoch": 0.09392, + "grad_norm": 0.8221337642937177, + "learning_rate": 0.003, + "loss": 4.1228, + "step": 9392 + }, + { + "epoch": 0.09393, + "grad_norm": 0.8631165883014514, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 9393 + }, + { + "epoch": 0.09394, + "grad_norm": 0.8542446307614803, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 9394 + }, + { + "epoch": 0.09395, + "grad_norm": 0.7984540124391786, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 9395 + }, + { + "epoch": 0.09396, + "grad_norm": 0.833132121750485, + "learning_rate": 0.003, + "loss": 4.1332, + "step": 9396 + }, + { + "epoch": 0.09397, + "grad_norm": 0.9833442838193437, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 9397 + }, + { + "epoch": 0.09398, + "grad_norm": 1.003667576167635, + "learning_rate": 0.003, + "loss": 4.13, + "step": 9398 + }, + { + "epoch": 0.09399, + "grad_norm": 0.9559424149218053, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 9399 + }, + { + "epoch": 0.094, + "grad_norm": 0.9201692580630481, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 9400 + }, + { + "epoch": 0.09401, + "grad_norm": 0.8825427431804942, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 9401 + }, + { + "epoch": 0.09402, + "grad_norm": 0.7898577204069231, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 9402 + }, + { + "epoch": 0.09403, + "grad_norm": 0.7350505587815206, + "learning_rate": 0.003, + "loss": 4.1294, + "step": 9403 + }, + { + "epoch": 0.09404, + "grad_norm": 0.7291684122942435, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 9404 + }, + { + "epoch": 0.09405, + "grad_norm": 0.8162712076769146, + "learning_rate": 0.003, + "loss": 4.1242, + "step": 9405 + }, + { + "epoch": 0.09406, + "grad_norm": 0.9155173238132704, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 9406 + }, + { + "epoch": 0.09407, + "grad_norm": 0.8628871245046202, + "learning_rate": 0.003, + "loss": 4.107, + "step": 9407 + }, + { + "epoch": 0.09408, + "grad_norm": 0.7194018791800877, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 9408 + }, + { + "epoch": 0.09409, + "grad_norm": 0.7409924782459923, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 9409 + }, + { + "epoch": 0.0941, + "grad_norm": 0.7794190789815792, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 9410 + }, + { + "epoch": 0.09411, + "grad_norm": 0.8734670824027785, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 9411 + }, + { + "epoch": 0.09412, + "grad_norm": 0.8088836225010004, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 9412 + }, + { + "epoch": 0.09413, + "grad_norm": 0.7602993796261419, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 9413 + }, + { + "epoch": 0.09414, + "grad_norm": 0.6810778606987302, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 9414 + }, + { + "epoch": 0.09415, + "grad_norm": 0.6032263201656475, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 9415 + }, + { + "epoch": 0.09416, + "grad_norm": 0.4846341007249403, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 9416 + }, + { + "epoch": 0.09417, + "grad_norm": 0.5073097020665416, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 9417 + }, + { + "epoch": 0.09418, + "grad_norm": 0.466812325936142, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 9418 + }, + { + "epoch": 0.09419, + "grad_norm": 0.4909393054849047, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 9419 + }, + { + "epoch": 0.0942, + "grad_norm": 0.4572606419258861, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 9420 + }, + { + "epoch": 0.09421, + "grad_norm": 0.5236689638625828, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 9421 + }, + { + "epoch": 0.09422, + "grad_norm": 0.663258154130139, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 9422 + }, + { + "epoch": 0.09423, + "grad_norm": 0.928899803447629, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 9423 + }, + { + "epoch": 0.09424, + "grad_norm": 1.1926656295302425, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 9424 + }, + { + "epoch": 0.09425, + "grad_norm": 0.6853441865571991, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 9425 + }, + { + "epoch": 0.09426, + "grad_norm": 0.5128317339031602, + "learning_rate": 0.003, + "loss": 4.1094, + "step": 9426 + }, + { + "epoch": 0.09427, + "grad_norm": 0.7070587189992616, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 9427 + }, + { + "epoch": 0.09428, + "grad_norm": 0.773759955306798, + "learning_rate": 0.003, + "loss": 4.11, + "step": 9428 + }, + { + "epoch": 0.09429, + "grad_norm": 0.811718324620787, + "learning_rate": 0.003, + "loss": 4.08, + "step": 9429 + }, + { + "epoch": 0.0943, + "grad_norm": 0.7612959659265646, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 9430 + }, + { + "epoch": 0.09431, + "grad_norm": 0.705635244969301, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 9431 + }, + { + "epoch": 0.09432, + "grad_norm": 0.7185246612672866, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 9432 + }, + { + "epoch": 0.09433, + "grad_norm": 0.808636412731162, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 9433 + }, + { + "epoch": 0.09434, + "grad_norm": 0.8588858555452056, + "learning_rate": 0.003, + "loss": 4.073, + "step": 9434 + }, + { + "epoch": 0.09435, + "grad_norm": 0.8829432922785366, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 9435 + }, + { + "epoch": 0.09436, + "grad_norm": 0.8229332972229458, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 9436 + }, + { + "epoch": 0.09437, + "grad_norm": 0.7502237223787239, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 9437 + }, + { + "epoch": 0.09438, + "grad_norm": 0.6894340570648702, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 9438 + }, + { + "epoch": 0.09439, + "grad_norm": 0.7210313125158538, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 9439 + }, + { + "epoch": 0.0944, + "grad_norm": 0.7260397120709652, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 9440 + }, + { + "epoch": 0.09441, + "grad_norm": 0.7867602084595864, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 9441 + }, + { + "epoch": 0.09442, + "grad_norm": 0.8364820552569051, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 9442 + }, + { + "epoch": 0.09443, + "grad_norm": 0.9738365901239612, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 9443 + }, + { + "epoch": 0.09444, + "grad_norm": 1.162136240204578, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 9444 + }, + { + "epoch": 0.09445, + "grad_norm": 0.8068568465517928, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 9445 + }, + { + "epoch": 0.09446, + "grad_norm": 0.7524396274055184, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 9446 + }, + { + "epoch": 0.09447, + "grad_norm": 0.6473920221674216, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 9447 + }, + { + "epoch": 0.09448, + "grad_norm": 0.6745944542419227, + "learning_rate": 0.003, + "loss": 4.099, + "step": 9448 + }, + { + "epoch": 0.09449, + "grad_norm": 0.6655279612546542, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 9449 + }, + { + "epoch": 0.0945, + "grad_norm": 0.6127094330682528, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 9450 + }, + { + "epoch": 0.09451, + "grad_norm": 0.621361177849897, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 9451 + }, + { + "epoch": 0.09452, + "grad_norm": 0.6362015223505509, + "learning_rate": 0.003, + "loss": 4.103, + "step": 9452 + }, + { + "epoch": 0.09453, + "grad_norm": 0.813045169964411, + "learning_rate": 0.003, + "loss": 4.053, + "step": 9453 + }, + { + "epoch": 0.09454, + "grad_norm": 1.1347224007728371, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 9454 + }, + { + "epoch": 0.09455, + "grad_norm": 0.8959042657473294, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 9455 + }, + { + "epoch": 0.09456, + "grad_norm": 0.7685184606728515, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 9456 + }, + { + "epoch": 0.09457, + "grad_norm": 0.7342695223426569, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 9457 + }, + { + "epoch": 0.09458, + "grad_norm": 0.8272094526351071, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 9458 + }, + { + "epoch": 0.09459, + "grad_norm": 0.8913799983064666, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 9459 + }, + { + "epoch": 0.0946, + "grad_norm": 0.7914931766921125, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 9460 + }, + { + "epoch": 0.09461, + "grad_norm": 0.7143667270257789, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 9461 + }, + { + "epoch": 0.09462, + "grad_norm": 0.7155076180457024, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 9462 + }, + { + "epoch": 0.09463, + "grad_norm": 0.689792898507005, + "learning_rate": 0.003, + "loss": 4.078, + "step": 9463 + }, + { + "epoch": 0.09464, + "grad_norm": 0.7023738600071371, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 9464 + }, + { + "epoch": 0.09465, + "grad_norm": 0.7187786079383829, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 9465 + }, + { + "epoch": 0.09466, + "grad_norm": 0.6302140939859505, + "learning_rate": 0.003, + "loss": 4.095, + "step": 9466 + }, + { + "epoch": 0.09467, + "grad_norm": 0.612265626337836, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 9467 + }, + { + "epoch": 0.09468, + "grad_norm": 0.6579286983017547, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 9468 + }, + { + "epoch": 0.09469, + "grad_norm": 0.5830435072856022, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 9469 + }, + { + "epoch": 0.0947, + "grad_norm": 0.5258507863729508, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 9470 + }, + { + "epoch": 0.09471, + "grad_norm": 0.46919009653098687, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 9471 + }, + { + "epoch": 0.09472, + "grad_norm": 0.5215402592850111, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 9472 + }, + { + "epoch": 0.09473, + "grad_norm": 0.6491591524568011, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 9473 + }, + { + "epoch": 0.09474, + "grad_norm": 0.7868164547787738, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 9474 + }, + { + "epoch": 0.09475, + "grad_norm": 0.9666742348116686, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 9475 + }, + { + "epoch": 0.09476, + "grad_norm": 1.0668974359449654, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 9476 + }, + { + "epoch": 0.09477, + "grad_norm": 0.8895495837200137, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 9477 + }, + { + "epoch": 0.09478, + "grad_norm": 0.7822408243265004, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 9478 + }, + { + "epoch": 0.09479, + "grad_norm": 0.7362350598274618, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 9479 + }, + { + "epoch": 0.0948, + "grad_norm": 0.7635551325432101, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 9480 + }, + { + "epoch": 0.09481, + "grad_norm": 0.8359593042143941, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 9481 + }, + { + "epoch": 0.09482, + "grad_norm": 0.8458302540004683, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 9482 + }, + { + "epoch": 0.09483, + "grad_norm": 0.8340671653924622, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 9483 + }, + { + "epoch": 0.09484, + "grad_norm": 0.7992408585225695, + "learning_rate": 0.003, + "loss": 4.108, + "step": 9484 + }, + { + "epoch": 0.09485, + "grad_norm": 0.7964148073286885, + "learning_rate": 0.003, + "loss": 4.089, + "step": 9485 + }, + { + "epoch": 0.09486, + "grad_norm": 0.8771624728266194, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 9486 + }, + { + "epoch": 0.09487, + "grad_norm": 0.9914369833754603, + "learning_rate": 0.003, + "loss": 4.099, + "step": 9487 + }, + { + "epoch": 0.09488, + "grad_norm": 0.92799842755361, + "learning_rate": 0.003, + "loss": 4.1431, + "step": 9488 + }, + { + "epoch": 0.09489, + "grad_norm": 0.7239696176957987, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 9489 + }, + { + "epoch": 0.0949, + "grad_norm": 0.657621056297735, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 9490 + }, + { + "epoch": 0.09491, + "grad_norm": 0.5920365056277531, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 9491 + }, + { + "epoch": 0.09492, + "grad_norm": 0.6824599764976818, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 9492 + }, + { + "epoch": 0.09493, + "grad_norm": 0.8856580120578575, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 9493 + }, + { + "epoch": 0.09494, + "grad_norm": 0.9809438136763652, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 9494 + }, + { + "epoch": 0.09495, + "grad_norm": 1.0818725077217133, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 9495 + }, + { + "epoch": 0.09496, + "grad_norm": 0.8173342329103473, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 9496 + }, + { + "epoch": 0.09497, + "grad_norm": 0.6206180769695743, + "learning_rate": 0.003, + "loss": 4.089, + "step": 9497 + }, + { + "epoch": 0.09498, + "grad_norm": 0.5589853203174866, + "learning_rate": 0.003, + "loss": 4.067, + "step": 9498 + }, + { + "epoch": 0.09499, + "grad_norm": 0.5738351051872796, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 9499 + }, + { + "epoch": 0.095, + "grad_norm": 0.6306862899639631, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 9500 + }, + { + "epoch": 0.09501, + "grad_norm": 0.6733724428518848, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 9501 + }, + { + "epoch": 0.09502, + "grad_norm": 0.8461241073955719, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 9502 + }, + { + "epoch": 0.09503, + "grad_norm": 1.0158067722960233, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 9503 + }, + { + "epoch": 0.09504, + "grad_norm": 1.1190865130171455, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 9504 + }, + { + "epoch": 0.09505, + "grad_norm": 0.7932088206811884, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 9505 + }, + { + "epoch": 0.09506, + "grad_norm": 0.7129006027867597, + "learning_rate": 0.003, + "loss": 4.1104, + "step": 9506 + }, + { + "epoch": 0.09507, + "grad_norm": 0.6024715566098497, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 9507 + }, + { + "epoch": 0.09508, + "grad_norm": 0.6057657514430907, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 9508 + }, + { + "epoch": 0.09509, + "grad_norm": 0.6243112506876197, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 9509 + }, + { + "epoch": 0.0951, + "grad_norm": 0.6744699873109112, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 9510 + }, + { + "epoch": 0.09511, + "grad_norm": 0.5698773213648972, + "learning_rate": 0.003, + "loss": 4.077, + "step": 9511 + }, + { + "epoch": 0.09512, + "grad_norm": 0.5223468938440529, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 9512 + }, + { + "epoch": 0.09513, + "grad_norm": 0.5295241801515409, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 9513 + }, + { + "epoch": 0.09514, + "grad_norm": 0.6195032274853618, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 9514 + }, + { + "epoch": 0.09515, + "grad_norm": 0.7923024129571904, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 9515 + }, + { + "epoch": 0.09516, + "grad_norm": 1.0748459000652908, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 9516 + }, + { + "epoch": 0.09517, + "grad_norm": 0.9652150547579573, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 9517 + }, + { + "epoch": 0.09518, + "grad_norm": 0.9317902587041311, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 9518 + }, + { + "epoch": 0.09519, + "grad_norm": 1.0011920568954655, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 9519 + }, + { + "epoch": 0.0952, + "grad_norm": 0.8162413257986282, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 9520 + }, + { + "epoch": 0.09521, + "grad_norm": 0.8511270427905199, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 9521 + }, + { + "epoch": 0.09522, + "grad_norm": 0.869150690242155, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 9522 + }, + { + "epoch": 0.09523, + "grad_norm": 0.7851073631966403, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 9523 + }, + { + "epoch": 0.09524, + "grad_norm": 0.7805600991791303, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 9524 + }, + { + "epoch": 0.09525, + "grad_norm": 0.6704373220450912, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 9525 + }, + { + "epoch": 0.09526, + "grad_norm": 0.6560833308141075, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 9526 + }, + { + "epoch": 0.09527, + "grad_norm": 0.6885732713810039, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 9527 + }, + { + "epoch": 0.09528, + "grad_norm": 0.9403396043081385, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 9528 + }, + { + "epoch": 0.09529, + "grad_norm": 1.00556887324088, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 9529 + }, + { + "epoch": 0.0953, + "grad_norm": 0.888729666716305, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 9530 + }, + { + "epoch": 0.09531, + "grad_norm": 0.9418145584346838, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 9531 + }, + { + "epoch": 0.09532, + "grad_norm": 0.9872093380526542, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 9532 + }, + { + "epoch": 0.09533, + "grad_norm": 1.0096517466771673, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 9533 + }, + { + "epoch": 0.09534, + "grad_norm": 0.8896105143964483, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 9534 + }, + { + "epoch": 0.09535, + "grad_norm": 0.7675771159928804, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 9535 + }, + { + "epoch": 0.09536, + "grad_norm": 0.7682126422834153, + "learning_rate": 0.003, + "loss": 4.1164, + "step": 9536 + }, + { + "epoch": 0.09537, + "grad_norm": 0.8056030277957967, + "learning_rate": 0.003, + "loss": 4.1242, + "step": 9537 + }, + { + "epoch": 0.09538, + "grad_norm": 0.7162391293635292, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 9538 + }, + { + "epoch": 0.09539, + "grad_norm": 0.6683543592127842, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 9539 + }, + { + "epoch": 0.0954, + "grad_norm": 0.6001385230297466, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 9540 + }, + { + "epoch": 0.09541, + "grad_norm": 0.5520189491403364, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 9541 + }, + { + "epoch": 0.09542, + "grad_norm": 0.5771983745529633, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 9542 + }, + { + "epoch": 0.09543, + "grad_norm": 0.6070661140119326, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 9543 + }, + { + "epoch": 0.09544, + "grad_norm": 0.7384419185478289, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 9544 + }, + { + "epoch": 0.09545, + "grad_norm": 0.9657170883830423, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 9545 + }, + { + "epoch": 0.09546, + "grad_norm": 1.2681324157872516, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 9546 + }, + { + "epoch": 0.09547, + "grad_norm": 0.62049585194807, + "learning_rate": 0.003, + "loss": 4.094, + "step": 9547 + }, + { + "epoch": 0.09548, + "grad_norm": 0.795505396852106, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 9548 + }, + { + "epoch": 0.09549, + "grad_norm": 0.9041023642688241, + "learning_rate": 0.003, + "loss": 4.129, + "step": 9549 + }, + { + "epoch": 0.0955, + "grad_norm": 0.7920466015961556, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 9550 + }, + { + "epoch": 0.09551, + "grad_norm": 0.7664489290961656, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 9551 + }, + { + "epoch": 0.09552, + "grad_norm": 0.7297971620268919, + "learning_rate": 0.003, + "loss": 4.1432, + "step": 9552 + }, + { + "epoch": 0.09553, + "grad_norm": 0.7231956245433028, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 9553 + }, + { + "epoch": 0.09554, + "grad_norm": 0.7096741017462413, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 9554 + }, + { + "epoch": 0.09555, + "grad_norm": 0.7188180696481433, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 9555 + }, + { + "epoch": 0.09556, + "grad_norm": 0.700959357888011, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 9556 + }, + { + "epoch": 0.09557, + "grad_norm": 0.6879426371558562, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 9557 + }, + { + "epoch": 0.09558, + "grad_norm": 0.6597031957541576, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 9558 + }, + { + "epoch": 0.09559, + "grad_norm": 0.6739137082114623, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 9559 + }, + { + "epoch": 0.0956, + "grad_norm": 0.6964119348231613, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 9560 + }, + { + "epoch": 0.09561, + "grad_norm": 0.7031263209660419, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 9561 + }, + { + "epoch": 0.09562, + "grad_norm": 0.9420485552641903, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 9562 + }, + { + "epoch": 0.09563, + "grad_norm": 6.130969416922354, + "learning_rate": 0.003, + "loss": 4.293, + "step": 9563 + }, + { + "epoch": 0.09564, + "grad_norm": 1.309865365129671, + "learning_rate": 0.003, + "loss": 4.1751, + "step": 9564 + }, + { + "epoch": 0.09565, + "grad_norm": 1.0521711710301151, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 9565 + }, + { + "epoch": 0.09566, + "grad_norm": 0.9997826713731996, + "learning_rate": 0.003, + "loss": 4.1481, + "step": 9566 + }, + { + "epoch": 0.09567, + "grad_norm": 1.3487618062811157, + "learning_rate": 0.003, + "loss": 4.1547, + "step": 9567 + }, + { + "epoch": 0.09568, + "grad_norm": 0.9741842634354936, + "learning_rate": 0.003, + "loss": 4.1462, + "step": 9568 + }, + { + "epoch": 0.09569, + "grad_norm": 1.0654371346008946, + "learning_rate": 0.003, + "loss": 4.1583, + "step": 9569 + }, + { + "epoch": 0.0957, + "grad_norm": 0.8712382456617402, + "learning_rate": 0.003, + "loss": 4.15, + "step": 9570 + }, + { + "epoch": 0.09571, + "grad_norm": 0.8849374832653227, + "learning_rate": 0.003, + "loss": 4.1475, + "step": 9571 + }, + { + "epoch": 0.09572, + "grad_norm": 1.0577800121636243, + "learning_rate": 0.003, + "loss": 4.1463, + "step": 9572 + }, + { + "epoch": 0.09573, + "grad_norm": 1.206598624557028, + "learning_rate": 0.003, + "loss": 4.1599, + "step": 9573 + }, + { + "epoch": 0.09574, + "grad_norm": 1.2512634504121267, + "learning_rate": 0.003, + "loss": 4.1668, + "step": 9574 + }, + { + "epoch": 0.09575, + "grad_norm": 0.9501656930930779, + "learning_rate": 0.003, + "loss": 4.1379, + "step": 9575 + }, + { + "epoch": 0.09576, + "grad_norm": 1.1728879533932677, + "learning_rate": 0.003, + "loss": 4.1477, + "step": 9576 + }, + { + "epoch": 0.09577, + "grad_norm": 2.0768187878146875, + "learning_rate": 0.003, + "loss": 4.184, + "step": 9577 + }, + { + "epoch": 0.09578, + "grad_norm": 1.105305608187662, + "learning_rate": 0.003, + "loss": 4.1486, + "step": 9578 + }, + { + "epoch": 0.09579, + "grad_norm": 0.9689648072779388, + "learning_rate": 0.003, + "loss": 4.1449, + "step": 9579 + }, + { + "epoch": 0.0958, + "grad_norm": 0.9360186586846221, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 9580 + }, + { + "epoch": 0.09581, + "grad_norm": 1.0800383998211391, + "learning_rate": 0.003, + "loss": 4.2082, + "step": 9581 + }, + { + "epoch": 0.09582, + "grad_norm": 1.0704867786853698, + "learning_rate": 0.003, + "loss": 4.1485, + "step": 9582 + }, + { + "epoch": 0.09583, + "grad_norm": 1.142877500779312, + "learning_rate": 0.003, + "loss": 4.1432, + "step": 9583 + }, + { + "epoch": 0.09584, + "grad_norm": 1.3332645879072875, + "learning_rate": 0.003, + "loss": 4.1696, + "step": 9584 + }, + { + "epoch": 0.09585, + "grad_norm": 1.5324381756718481, + "learning_rate": 0.003, + "loss": 4.1664, + "step": 9585 + }, + { + "epoch": 0.09586, + "grad_norm": 0.8704649822421158, + "learning_rate": 0.003, + "loss": 4.1747, + "step": 9586 + }, + { + "epoch": 0.09587, + "grad_norm": 0.9688340079251864, + "learning_rate": 0.003, + "loss": 4.126, + "step": 9587 + }, + { + "epoch": 0.09588, + "grad_norm": 0.8342771328110887, + "learning_rate": 0.003, + "loss": 4.1531, + "step": 9588 + }, + { + "epoch": 0.09589, + "grad_norm": 0.7942887994349634, + "learning_rate": 0.003, + "loss": 4.153, + "step": 9589 + }, + { + "epoch": 0.0959, + "grad_norm": 0.8173041322090617, + "learning_rate": 0.003, + "loss": 4.1609, + "step": 9590 + }, + { + "epoch": 0.09591, + "grad_norm": 0.7677670828705769, + "learning_rate": 0.003, + "loss": 4.1874, + "step": 9591 + }, + { + "epoch": 0.09592, + "grad_norm": 0.7200915670319258, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 9592 + }, + { + "epoch": 0.09593, + "grad_norm": 0.7972359349347333, + "learning_rate": 0.003, + "loss": 4.1567, + "step": 9593 + }, + { + "epoch": 0.09594, + "grad_norm": 0.8556890271092606, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 9594 + }, + { + "epoch": 0.09595, + "grad_norm": 0.8915728396380502, + "learning_rate": 0.003, + "loss": 4.1244, + "step": 9595 + }, + { + "epoch": 0.09596, + "grad_norm": 0.9347507452393141, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 9596 + }, + { + "epoch": 0.09597, + "grad_norm": 1.0024508636239098, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 9597 + }, + { + "epoch": 0.09598, + "grad_norm": 1.2169763273515621, + "learning_rate": 0.003, + "loss": 4.1545, + "step": 9598 + }, + { + "epoch": 0.09599, + "grad_norm": 0.7115963030817509, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 9599 + }, + { + "epoch": 0.096, + "grad_norm": 0.6328045403895035, + "learning_rate": 0.003, + "loss": 4.1151, + "step": 9600 + }, + { + "epoch": 0.09601, + "grad_norm": 0.5998687509571087, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 9601 + }, + { + "epoch": 0.09602, + "grad_norm": 0.5466887961482548, + "learning_rate": 0.003, + "loss": 4.1438, + "step": 9602 + }, + { + "epoch": 0.09603, + "grad_norm": 0.7860448470509064, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 9603 + }, + { + "epoch": 0.09604, + "grad_norm": 0.8745624847605153, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 9604 + }, + { + "epoch": 0.09605, + "grad_norm": 0.8688163043711737, + "learning_rate": 0.003, + "loss": 4.1279, + "step": 9605 + }, + { + "epoch": 0.09606, + "grad_norm": 0.7963417759657033, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 9606 + }, + { + "epoch": 0.09607, + "grad_norm": 0.7369028440038995, + "learning_rate": 0.003, + "loss": 4.1318, + "step": 9607 + }, + { + "epoch": 0.09608, + "grad_norm": 0.8647736584876029, + "learning_rate": 0.003, + "loss": 4.1427, + "step": 9608 + }, + { + "epoch": 0.09609, + "grad_norm": 1.0027577187055126, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 9609 + }, + { + "epoch": 0.0961, + "grad_norm": 0.8626178763464114, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 9610 + }, + { + "epoch": 0.09611, + "grad_norm": 0.5910337185914825, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 9611 + }, + { + "epoch": 0.09612, + "grad_norm": 0.49438202417856203, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 9612 + }, + { + "epoch": 0.09613, + "grad_norm": 0.484763634619797, + "learning_rate": 0.003, + "loss": 4.1161, + "step": 9613 + }, + { + "epoch": 0.09614, + "grad_norm": 0.44675304497501767, + "learning_rate": 0.003, + "loss": 4.103, + "step": 9614 + }, + { + "epoch": 0.09615, + "grad_norm": 0.3830623042239131, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 9615 + }, + { + "epoch": 0.09616, + "grad_norm": 0.40447953537590603, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 9616 + }, + { + "epoch": 0.09617, + "grad_norm": 0.42749479774334626, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 9617 + }, + { + "epoch": 0.09618, + "grad_norm": 0.3807451005616551, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 9618 + }, + { + "epoch": 0.09619, + "grad_norm": 0.3715423418211784, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 9619 + }, + { + "epoch": 0.0962, + "grad_norm": 0.3425603111991135, + "learning_rate": 0.003, + "loss": 4.074, + "step": 9620 + }, + { + "epoch": 0.09621, + "grad_norm": 0.3507562163489838, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 9621 + }, + { + "epoch": 0.09622, + "grad_norm": 0.29748679852114185, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 9622 + }, + { + "epoch": 0.09623, + "grad_norm": 0.2839394173155373, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 9623 + }, + { + "epoch": 0.09624, + "grad_norm": 0.2794661039316224, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 9624 + }, + { + "epoch": 0.09625, + "grad_norm": 0.29629458143975185, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 9625 + }, + { + "epoch": 0.09626, + "grad_norm": 0.3292412126393909, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 9626 + }, + { + "epoch": 0.09627, + "grad_norm": 0.4497439282959039, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 9627 + }, + { + "epoch": 0.09628, + "grad_norm": 0.6526850258400877, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 9628 + }, + { + "epoch": 0.09629, + "grad_norm": 0.9634340689907828, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 9629 + }, + { + "epoch": 0.0963, + "grad_norm": 1.1516478385031867, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 9630 + }, + { + "epoch": 0.09631, + "grad_norm": 0.6401093793721576, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 9631 + }, + { + "epoch": 0.09632, + "grad_norm": 0.7636314345691994, + "learning_rate": 0.003, + "loss": 4.059, + "step": 9632 + }, + { + "epoch": 0.09633, + "grad_norm": 0.992571157461324, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 9633 + }, + { + "epoch": 0.09634, + "grad_norm": 0.7463855736399202, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 9634 + }, + { + "epoch": 0.09635, + "grad_norm": 0.6909299649125988, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 9635 + }, + { + "epoch": 0.09636, + "grad_norm": 0.7431003319653801, + "learning_rate": 0.003, + "loss": 4.109, + "step": 9636 + }, + { + "epoch": 0.09637, + "grad_norm": 0.6455512496885061, + "learning_rate": 0.003, + "loss": 4.075, + "step": 9637 + }, + { + "epoch": 0.09638, + "grad_norm": 0.5445783213282376, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 9638 + }, + { + "epoch": 0.09639, + "grad_norm": 0.5613727528028151, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 9639 + }, + { + "epoch": 0.0964, + "grad_norm": 0.7244593336080651, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 9640 + }, + { + "epoch": 0.09641, + "grad_norm": 0.9030660564578993, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 9641 + }, + { + "epoch": 0.09642, + "grad_norm": 0.9887019736303208, + "learning_rate": 0.003, + "loss": 4.1241, + "step": 9642 + }, + { + "epoch": 0.09643, + "grad_norm": 0.8838383160348036, + "learning_rate": 0.003, + "loss": 4.092, + "step": 9643 + }, + { + "epoch": 0.09644, + "grad_norm": 0.7515245578827098, + "learning_rate": 0.003, + "loss": 4.1104, + "step": 9644 + }, + { + "epoch": 0.09645, + "grad_norm": 0.6649872915216315, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 9645 + }, + { + "epoch": 0.09646, + "grad_norm": 0.7116890218147718, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 9646 + }, + { + "epoch": 0.09647, + "grad_norm": 0.7019694940498846, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 9647 + }, + { + "epoch": 0.09648, + "grad_norm": 0.6596706388975605, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 9648 + }, + { + "epoch": 0.09649, + "grad_norm": 0.667100870547891, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 9649 + }, + { + "epoch": 0.0965, + "grad_norm": 0.6563616277238821, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 9650 + }, + { + "epoch": 0.09651, + "grad_norm": 0.6207032309420466, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 9651 + }, + { + "epoch": 0.09652, + "grad_norm": 0.6164765321685511, + "learning_rate": 0.003, + "loss": 4.08, + "step": 9652 + }, + { + "epoch": 0.09653, + "grad_norm": 0.6425858801124605, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 9653 + }, + { + "epoch": 0.09654, + "grad_norm": 0.6719434091175973, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 9654 + }, + { + "epoch": 0.09655, + "grad_norm": 0.6816595879015243, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 9655 + }, + { + "epoch": 0.09656, + "grad_norm": 0.75276222897346, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 9656 + }, + { + "epoch": 0.09657, + "grad_norm": 0.793810434998781, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 9657 + }, + { + "epoch": 0.09658, + "grad_norm": 0.8203985544035876, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 9658 + }, + { + "epoch": 0.09659, + "grad_norm": 0.7228109813703856, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 9659 + }, + { + "epoch": 0.0966, + "grad_norm": 0.6284036142391546, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 9660 + }, + { + "epoch": 0.09661, + "grad_norm": 0.5390823749375145, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 9661 + }, + { + "epoch": 0.09662, + "grad_norm": 0.4937468666738386, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 9662 + }, + { + "epoch": 0.09663, + "grad_norm": 0.5267363875644104, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 9663 + }, + { + "epoch": 0.09664, + "grad_norm": 0.5614869173293953, + "learning_rate": 0.003, + "loss": 4.094, + "step": 9664 + }, + { + "epoch": 0.09665, + "grad_norm": 0.6434988568327966, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 9665 + }, + { + "epoch": 0.09666, + "grad_norm": 0.6863099786692454, + "learning_rate": 0.003, + "loss": 4.091, + "step": 9666 + }, + { + "epoch": 0.09667, + "grad_norm": 0.6974192050812339, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 9667 + }, + { + "epoch": 0.09668, + "grad_norm": 0.7130895485506055, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 9668 + }, + { + "epoch": 0.09669, + "grad_norm": 0.7314830324385385, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 9669 + }, + { + "epoch": 0.0967, + "grad_norm": 0.7358599798360544, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 9670 + }, + { + "epoch": 0.09671, + "grad_norm": 0.6821836408473856, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 9671 + }, + { + "epoch": 0.09672, + "grad_norm": 0.6652927918131237, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 9672 + }, + { + "epoch": 0.09673, + "grad_norm": 0.7634068597168817, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 9673 + }, + { + "epoch": 0.09674, + "grad_norm": 0.7974126591485172, + "learning_rate": 0.003, + "loss": 4.1387, + "step": 9674 + }, + { + "epoch": 0.09675, + "grad_norm": 0.8544438788166088, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 9675 + }, + { + "epoch": 0.09676, + "grad_norm": 0.7941264785151647, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 9676 + }, + { + "epoch": 0.09677, + "grad_norm": 0.7203225810853267, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 9677 + }, + { + "epoch": 0.09678, + "grad_norm": 0.5740375147446136, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 9678 + }, + { + "epoch": 0.09679, + "grad_norm": 0.60439656922455, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 9679 + }, + { + "epoch": 0.0968, + "grad_norm": 0.7001578420854571, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 9680 + }, + { + "epoch": 0.09681, + "grad_norm": 0.8530019659681068, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 9681 + }, + { + "epoch": 0.09682, + "grad_norm": 1.0076727467524107, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 9682 + }, + { + "epoch": 0.09683, + "grad_norm": 1.1399405747494864, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 9683 + }, + { + "epoch": 0.09684, + "grad_norm": 0.6866617378578579, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 9684 + }, + { + "epoch": 0.09685, + "grad_norm": 0.6034395427034922, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 9685 + }, + { + "epoch": 0.09686, + "grad_norm": 0.6858126543232959, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 9686 + }, + { + "epoch": 0.09687, + "grad_norm": 0.7606273901161237, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 9687 + }, + { + "epoch": 0.09688, + "grad_norm": 0.7823206051561976, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 9688 + }, + { + "epoch": 0.09689, + "grad_norm": 0.7275273786792908, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 9689 + }, + { + "epoch": 0.0969, + "grad_norm": 0.7020173666361166, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 9690 + }, + { + "epoch": 0.09691, + "grad_norm": 0.668700670688743, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 9691 + }, + { + "epoch": 0.09692, + "grad_norm": 0.7183283334951155, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 9692 + }, + { + "epoch": 0.09693, + "grad_norm": 0.8228261565164465, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 9693 + }, + { + "epoch": 0.09694, + "grad_norm": 0.9398860669380985, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 9694 + }, + { + "epoch": 0.09695, + "grad_norm": 0.9679499359904971, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 9695 + }, + { + "epoch": 0.09696, + "grad_norm": 0.9831277733673565, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 9696 + }, + { + "epoch": 0.09697, + "grad_norm": 0.8922294143645066, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 9697 + }, + { + "epoch": 0.09698, + "grad_norm": 0.8622941183892853, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 9698 + }, + { + "epoch": 0.09699, + "grad_norm": 0.849954807407499, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 9699 + }, + { + "epoch": 0.097, + "grad_norm": 0.8595604678823553, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 9700 + }, + { + "epoch": 0.09701, + "grad_norm": 0.7378223203944358, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 9701 + }, + { + "epoch": 0.09702, + "grad_norm": 0.6131607408869036, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 9702 + }, + { + "epoch": 0.09703, + "grad_norm": 0.5814032117502681, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 9703 + }, + { + "epoch": 0.09704, + "grad_norm": 0.6207822778251723, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 9704 + }, + { + "epoch": 0.09705, + "grad_norm": 0.5424790257052854, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 9705 + }, + { + "epoch": 0.09706, + "grad_norm": 0.5191042976935801, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 9706 + }, + { + "epoch": 0.09707, + "grad_norm": 0.4825162871461943, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 9707 + }, + { + "epoch": 0.09708, + "grad_norm": 0.5254613203667191, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 9708 + }, + { + "epoch": 0.09709, + "grad_norm": 0.5837214908960893, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 9709 + }, + { + "epoch": 0.0971, + "grad_norm": 0.5919660600745414, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 9710 + }, + { + "epoch": 0.09711, + "grad_norm": 0.5985906886004138, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 9711 + }, + { + "epoch": 0.09712, + "grad_norm": 0.6218596201649731, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 9712 + }, + { + "epoch": 0.09713, + "grad_norm": 0.6609701796902929, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 9713 + }, + { + "epoch": 0.09714, + "grad_norm": 0.5993265445219289, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 9714 + }, + { + "epoch": 0.09715, + "grad_norm": 0.5595560077596979, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 9715 + }, + { + "epoch": 0.09716, + "grad_norm": 0.5458771794387989, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 9716 + }, + { + "epoch": 0.09717, + "grad_norm": 0.5749644308670316, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 9717 + }, + { + "epoch": 0.09718, + "grad_norm": 0.7136367880167833, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 9718 + }, + { + "epoch": 0.09719, + "grad_norm": 0.8710577217664088, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 9719 + }, + { + "epoch": 0.0972, + "grad_norm": 1.1290953541399391, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 9720 + }, + { + "epoch": 0.09721, + "grad_norm": 0.9640405697464676, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 9721 + }, + { + "epoch": 0.09722, + "grad_norm": 0.947033121500074, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 9722 + }, + { + "epoch": 0.09723, + "grad_norm": 0.864058303867878, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 9723 + }, + { + "epoch": 0.09724, + "grad_norm": 0.7910749200085015, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 9724 + }, + { + "epoch": 0.09725, + "grad_norm": 0.6706273678022362, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 9725 + }, + { + "epoch": 0.09726, + "grad_norm": 0.6952803945114413, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 9726 + }, + { + "epoch": 0.09727, + "grad_norm": 0.7730372856532575, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 9727 + }, + { + "epoch": 0.09728, + "grad_norm": 0.6776321846761416, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 9728 + }, + { + "epoch": 0.09729, + "grad_norm": 0.656555599450724, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 9729 + }, + { + "epoch": 0.0973, + "grad_norm": 0.7397413425845409, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 9730 + }, + { + "epoch": 0.09731, + "grad_norm": 0.741668722027564, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 9731 + }, + { + "epoch": 0.09732, + "grad_norm": 0.9192765996179019, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 9732 + }, + { + "epoch": 0.09733, + "grad_norm": 0.9726160313328494, + "learning_rate": 0.003, + "loss": 4.095, + "step": 9733 + }, + { + "epoch": 0.09734, + "grad_norm": 0.8089628303200062, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 9734 + }, + { + "epoch": 0.09735, + "grad_norm": 0.9196854053701035, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 9735 + }, + { + "epoch": 0.09736, + "grad_norm": 1.0355781723270967, + "learning_rate": 0.003, + "loss": 4.094, + "step": 9736 + }, + { + "epoch": 0.09737, + "grad_norm": 1.390013473224088, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 9737 + }, + { + "epoch": 0.09738, + "grad_norm": 0.6677058435549194, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 9738 + }, + { + "epoch": 0.09739, + "grad_norm": 0.7149231696043877, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 9739 + }, + { + "epoch": 0.0974, + "grad_norm": 0.7596898377209321, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 9740 + }, + { + "epoch": 0.09741, + "grad_norm": 0.9445678693533357, + "learning_rate": 0.003, + "loss": 4.1459, + "step": 9741 + }, + { + "epoch": 0.09742, + "grad_norm": 1.144761757205563, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 9742 + }, + { + "epoch": 0.09743, + "grad_norm": 0.7798904071068536, + "learning_rate": 0.003, + "loss": 4.11, + "step": 9743 + }, + { + "epoch": 0.09744, + "grad_norm": 0.7126950822384531, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 9744 + }, + { + "epoch": 0.09745, + "grad_norm": 0.804932675736049, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 9745 + }, + { + "epoch": 0.09746, + "grad_norm": 0.8275041917630538, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 9746 + }, + { + "epoch": 0.09747, + "grad_norm": 0.8654165216744595, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 9747 + }, + { + "epoch": 0.09748, + "grad_norm": 0.8483082244535185, + "learning_rate": 0.003, + "loss": 4.1333, + "step": 9748 + }, + { + "epoch": 0.09749, + "grad_norm": 0.9655796883096281, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 9749 + }, + { + "epoch": 0.0975, + "grad_norm": 0.8375150794577596, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 9750 + }, + { + "epoch": 0.09751, + "grad_norm": 0.8357319865847818, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 9751 + }, + { + "epoch": 0.09752, + "grad_norm": 0.7805475045931051, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 9752 + }, + { + "epoch": 0.09753, + "grad_norm": 0.7014801049003869, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 9753 + }, + { + "epoch": 0.09754, + "grad_norm": 0.6856528434863608, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 9754 + }, + { + "epoch": 0.09755, + "grad_norm": 0.650848045634542, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 9755 + }, + { + "epoch": 0.09756, + "grad_norm": 0.5622036395323345, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 9756 + }, + { + "epoch": 0.09757, + "grad_norm": 0.5679917813675583, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 9757 + }, + { + "epoch": 0.09758, + "grad_norm": 0.5576435578489023, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 9758 + }, + { + "epoch": 0.09759, + "grad_norm": 0.6963321867164363, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 9759 + }, + { + "epoch": 0.0976, + "grad_norm": 0.7838079745448902, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 9760 + }, + { + "epoch": 0.09761, + "grad_norm": 0.9541884510786587, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 9761 + }, + { + "epoch": 0.09762, + "grad_norm": 1.121153313260097, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 9762 + }, + { + "epoch": 0.09763, + "grad_norm": 0.6664939878081045, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 9763 + }, + { + "epoch": 0.09764, + "grad_norm": 0.6067299530309102, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 9764 + }, + { + "epoch": 0.09765, + "grad_norm": 0.6916937760369664, + "learning_rate": 0.003, + "loss": 4.1132, + "step": 9765 + }, + { + "epoch": 0.09766, + "grad_norm": 0.7187195565497518, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 9766 + }, + { + "epoch": 0.09767, + "grad_norm": 0.7297554741322773, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 9767 + }, + { + "epoch": 0.09768, + "grad_norm": 0.7197047154946628, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 9768 + }, + { + "epoch": 0.09769, + "grad_norm": 0.77558338050166, + "learning_rate": 0.003, + "loss": 4.107, + "step": 9769 + }, + { + "epoch": 0.0977, + "grad_norm": 0.776706503347036, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 9770 + }, + { + "epoch": 0.09771, + "grad_norm": 0.6629745622875617, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 9771 + }, + { + "epoch": 0.09772, + "grad_norm": 0.599070691762346, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 9772 + }, + { + "epoch": 0.09773, + "grad_norm": 0.6430974616537984, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 9773 + }, + { + "epoch": 0.09774, + "grad_norm": 0.65829695633147, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 9774 + }, + { + "epoch": 0.09775, + "grad_norm": 0.6436886424967752, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 9775 + }, + { + "epoch": 0.09776, + "grad_norm": 0.7272224412585245, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 9776 + }, + { + "epoch": 0.09777, + "grad_norm": 0.7810004465966248, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 9777 + }, + { + "epoch": 0.09778, + "grad_norm": 0.843156271360344, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 9778 + }, + { + "epoch": 0.09779, + "grad_norm": 0.8542309059649245, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 9779 + }, + { + "epoch": 0.0978, + "grad_norm": 0.8284073938545176, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 9780 + }, + { + "epoch": 0.09781, + "grad_norm": 0.7656977589639479, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 9781 + }, + { + "epoch": 0.09782, + "grad_norm": 0.642867591138632, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 9782 + }, + { + "epoch": 0.09783, + "grad_norm": 0.6315054401796648, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 9783 + }, + { + "epoch": 0.09784, + "grad_norm": 0.6506631017065446, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 9784 + }, + { + "epoch": 0.09785, + "grad_norm": 0.6870713649494674, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 9785 + }, + { + "epoch": 0.09786, + "grad_norm": 0.7447419206648687, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 9786 + }, + { + "epoch": 0.09787, + "grad_norm": 0.8992521452563863, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 9787 + }, + { + "epoch": 0.09788, + "grad_norm": 0.8733619920631194, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 9788 + }, + { + "epoch": 0.09789, + "grad_norm": 0.829358760588995, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 9789 + }, + { + "epoch": 0.0979, + "grad_norm": 0.8166018992244876, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 9790 + }, + { + "epoch": 0.09791, + "grad_norm": 0.8700145284441482, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 9791 + }, + { + "epoch": 0.09792, + "grad_norm": 0.9672422465296193, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 9792 + }, + { + "epoch": 0.09793, + "grad_norm": 0.8861730905617189, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 9793 + }, + { + "epoch": 0.09794, + "grad_norm": 0.8248882883309061, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 9794 + }, + { + "epoch": 0.09795, + "grad_norm": 0.7088842143033923, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 9795 + }, + { + "epoch": 0.09796, + "grad_norm": 0.7130954093466139, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 9796 + }, + { + "epoch": 0.09797, + "grad_norm": 0.7288682239466572, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 9797 + }, + { + "epoch": 0.09798, + "grad_norm": 0.7976440093808327, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 9798 + }, + { + "epoch": 0.09799, + "grad_norm": 0.9649496398862366, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 9799 + }, + { + "epoch": 0.098, + "grad_norm": 1.1331720133396581, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 9800 + }, + { + "epoch": 0.09801, + "grad_norm": 0.7034672135712403, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 9801 + }, + { + "epoch": 0.09802, + "grad_norm": 0.6773839521714846, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 9802 + }, + { + "epoch": 0.09803, + "grad_norm": 0.7355150714696657, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 9803 + }, + { + "epoch": 0.09804, + "grad_norm": 0.7174536168973166, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 9804 + }, + { + "epoch": 0.09805, + "grad_norm": 0.6507571892133279, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 9805 + }, + { + "epoch": 0.09806, + "grad_norm": 0.64602677559084, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 9806 + }, + { + "epoch": 0.09807, + "grad_norm": 0.7072086988189831, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 9807 + }, + { + "epoch": 0.09808, + "grad_norm": 0.6694715352849688, + "learning_rate": 0.003, + "loss": 4.1121, + "step": 9808 + }, + { + "epoch": 0.09809, + "grad_norm": 0.641647359947315, + "learning_rate": 0.003, + "loss": 4.077, + "step": 9809 + }, + { + "epoch": 0.0981, + "grad_norm": 0.6231401766200729, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 9810 + }, + { + "epoch": 0.09811, + "grad_norm": 0.6349700167139162, + "learning_rate": 0.003, + "loss": 4.104, + "step": 9811 + }, + { + "epoch": 0.09812, + "grad_norm": 0.6948447910573521, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 9812 + }, + { + "epoch": 0.09813, + "grad_norm": 0.8393980996176053, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 9813 + }, + { + "epoch": 0.09814, + "grad_norm": 0.9318454756189145, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 9814 + }, + { + "epoch": 0.09815, + "grad_norm": 0.9599372181289262, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 9815 + }, + { + "epoch": 0.09816, + "grad_norm": 0.8956701047579604, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 9816 + }, + { + "epoch": 0.09817, + "grad_norm": 0.7842843597137388, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 9817 + }, + { + "epoch": 0.09818, + "grad_norm": 0.6893223770225535, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 9818 + }, + { + "epoch": 0.09819, + "grad_norm": 0.5764089558777323, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 9819 + }, + { + "epoch": 0.0982, + "grad_norm": 0.6831187750693034, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 9820 + }, + { + "epoch": 0.09821, + "grad_norm": 0.7420126000497227, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 9821 + }, + { + "epoch": 0.09822, + "grad_norm": 0.7918367231819561, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 9822 + }, + { + "epoch": 0.09823, + "grad_norm": 0.8307403908166644, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 9823 + }, + { + "epoch": 0.09824, + "grad_norm": 0.8384902251512314, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 9824 + }, + { + "epoch": 0.09825, + "grad_norm": 0.9488542492159057, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 9825 + }, + { + "epoch": 0.09826, + "grad_norm": 0.9814885356015456, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 9826 + }, + { + "epoch": 0.09827, + "grad_norm": 1.0404257496474245, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 9827 + }, + { + "epoch": 0.09828, + "grad_norm": 0.9374789182838859, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 9828 + }, + { + "epoch": 0.09829, + "grad_norm": 0.8352504231117999, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 9829 + }, + { + "epoch": 0.0983, + "grad_norm": 0.8712005573355397, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 9830 + }, + { + "epoch": 0.09831, + "grad_norm": 0.8221414372597472, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 9831 + }, + { + "epoch": 0.09832, + "grad_norm": 0.7097651197485733, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 9832 + }, + { + "epoch": 0.09833, + "grad_norm": 0.7171718585045, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 9833 + }, + { + "epoch": 0.09834, + "grad_norm": 0.7926851111043537, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 9834 + }, + { + "epoch": 0.09835, + "grad_norm": 0.8623151532060463, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 9835 + }, + { + "epoch": 0.09836, + "grad_norm": 0.8199248380589881, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 9836 + }, + { + "epoch": 0.09837, + "grad_norm": 0.7755743905810927, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 9837 + }, + { + "epoch": 0.09838, + "grad_norm": 0.7705706750252852, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 9838 + }, + { + "epoch": 0.09839, + "grad_norm": 0.845394429517052, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 9839 + }, + { + "epoch": 0.0984, + "grad_norm": 0.9405467332911781, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 9840 + }, + { + "epoch": 0.09841, + "grad_norm": 0.8405964634828268, + "learning_rate": 0.003, + "loss": 4.111, + "step": 9841 + }, + { + "epoch": 0.09842, + "grad_norm": 0.7811416765302842, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 9842 + }, + { + "epoch": 0.09843, + "grad_norm": 0.8180023616159704, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 9843 + }, + { + "epoch": 0.09844, + "grad_norm": 0.7527374804721993, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 9844 + }, + { + "epoch": 0.09845, + "grad_norm": 0.7530390141763947, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 9845 + }, + { + "epoch": 0.09846, + "grad_norm": 0.8899981921507195, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 9846 + }, + { + "epoch": 0.09847, + "grad_norm": 1.1111905360253278, + "learning_rate": 0.003, + "loss": 4.105, + "step": 9847 + }, + { + "epoch": 0.09848, + "grad_norm": 0.8799351792098972, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 9848 + }, + { + "epoch": 0.09849, + "grad_norm": 0.6714966433483521, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 9849 + }, + { + "epoch": 0.0985, + "grad_norm": 0.660208855694013, + "learning_rate": 0.003, + "loss": 4.112, + "step": 9850 + }, + { + "epoch": 0.09851, + "grad_norm": 0.7037816807855953, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 9851 + }, + { + "epoch": 0.09852, + "grad_norm": 0.7854724491123188, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 9852 + }, + { + "epoch": 0.09853, + "grad_norm": 0.8934963259953501, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 9853 + }, + { + "epoch": 0.09854, + "grad_norm": 0.8084018134106394, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 9854 + }, + { + "epoch": 0.09855, + "grad_norm": 0.795443981883104, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 9855 + }, + { + "epoch": 0.09856, + "grad_norm": 0.7281963252673843, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 9856 + }, + { + "epoch": 0.09857, + "grad_norm": 0.7182745458303548, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 9857 + }, + { + "epoch": 0.09858, + "grad_norm": 0.7050569720124196, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 9858 + }, + { + "epoch": 0.09859, + "grad_norm": 0.6807473261285026, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 9859 + }, + { + "epoch": 0.0986, + "grad_norm": 0.6746756830722065, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 9860 + }, + { + "epoch": 0.09861, + "grad_norm": 0.7007907511364769, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 9861 + }, + { + "epoch": 0.09862, + "grad_norm": 0.7927126135369934, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 9862 + }, + { + "epoch": 0.09863, + "grad_norm": 0.8386321721308743, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 9863 + }, + { + "epoch": 0.09864, + "grad_norm": 0.7507936235349171, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 9864 + }, + { + "epoch": 0.09865, + "grad_norm": 0.6481226992818734, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 9865 + }, + { + "epoch": 0.09866, + "grad_norm": 0.5697115094410177, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 9866 + }, + { + "epoch": 0.09867, + "grad_norm": 0.5126449464627183, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 9867 + }, + { + "epoch": 0.09868, + "grad_norm": 0.5102549762756471, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 9868 + }, + { + "epoch": 0.09869, + "grad_norm": 0.5085779141402881, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 9869 + }, + { + "epoch": 0.0987, + "grad_norm": 0.5428508446899835, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 9870 + }, + { + "epoch": 0.09871, + "grad_norm": 0.6546855703044885, + "learning_rate": 0.003, + "loss": 4.1347, + "step": 9871 + }, + { + "epoch": 0.09872, + "grad_norm": 0.6934739489178067, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 9872 + }, + { + "epoch": 0.09873, + "grad_norm": 0.6568023982508876, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 9873 + }, + { + "epoch": 0.09874, + "grad_norm": 0.7426087598644382, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 9874 + }, + { + "epoch": 0.09875, + "grad_norm": 0.9646739748869851, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 9875 + }, + { + "epoch": 0.09876, + "grad_norm": 1.1760726018799912, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 9876 + }, + { + "epoch": 0.09877, + "grad_norm": 1.032505678325432, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 9877 + }, + { + "epoch": 0.09878, + "grad_norm": 0.9520034852500903, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 9878 + }, + { + "epoch": 0.09879, + "grad_norm": 0.8304599751268602, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 9879 + }, + { + "epoch": 0.0988, + "grad_norm": 0.7973040789135167, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 9880 + }, + { + "epoch": 0.09881, + "grad_norm": 0.6749261652206336, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 9881 + }, + { + "epoch": 0.09882, + "grad_norm": 0.6720424310436138, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 9882 + }, + { + "epoch": 0.09883, + "grad_norm": 0.6552937594046303, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 9883 + }, + { + "epoch": 0.09884, + "grad_norm": 0.8385733301338101, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 9884 + }, + { + "epoch": 0.09885, + "grad_norm": 0.787180609022946, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 9885 + }, + { + "epoch": 0.09886, + "grad_norm": 0.7214374906120231, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 9886 + }, + { + "epoch": 0.09887, + "grad_norm": 0.813828996233088, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 9887 + }, + { + "epoch": 0.09888, + "grad_norm": 0.9395429521943715, + "learning_rate": 0.003, + "loss": 4.1274, + "step": 9888 + }, + { + "epoch": 0.09889, + "grad_norm": 0.9594036018439058, + "learning_rate": 0.003, + "loss": 4.082, + "step": 9889 + }, + { + "epoch": 0.0989, + "grad_norm": 1.0604047721378298, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 9890 + }, + { + "epoch": 0.09891, + "grad_norm": 0.9760792138626977, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 9891 + }, + { + "epoch": 0.09892, + "grad_norm": 0.8055680873064225, + "learning_rate": 0.003, + "loss": 4.067, + "step": 9892 + }, + { + "epoch": 0.09893, + "grad_norm": 0.7150553503473565, + "learning_rate": 0.003, + "loss": 4.094, + "step": 9893 + }, + { + "epoch": 0.09894, + "grad_norm": 0.5919754883801267, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 9894 + }, + { + "epoch": 0.09895, + "grad_norm": 0.6042264333876616, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 9895 + }, + { + "epoch": 0.09896, + "grad_norm": 0.5430950181914646, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 9896 + }, + { + "epoch": 0.09897, + "grad_norm": 0.7064510242910831, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 9897 + }, + { + "epoch": 0.09898, + "grad_norm": 0.8003106649351045, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 9898 + }, + { + "epoch": 0.09899, + "grad_norm": 0.8270519641387873, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 9899 + }, + { + "epoch": 0.099, + "grad_norm": 0.7540500751340824, + "learning_rate": 0.003, + "loss": 4.1119, + "step": 9900 + }, + { + "epoch": 0.09901, + "grad_norm": 0.6671609878427571, + "learning_rate": 0.003, + "loss": 4.091, + "step": 9901 + }, + { + "epoch": 0.09902, + "grad_norm": 0.7272528832102878, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 9902 + }, + { + "epoch": 0.09903, + "grad_norm": 0.749667895778945, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 9903 + }, + { + "epoch": 0.09904, + "grad_norm": 0.6741213580842577, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 9904 + }, + { + "epoch": 0.09905, + "grad_norm": 0.6736961275412793, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 9905 + }, + { + "epoch": 0.09906, + "grad_norm": 0.6560958029133749, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 9906 + }, + { + "epoch": 0.09907, + "grad_norm": 0.631422253019129, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 9907 + }, + { + "epoch": 0.09908, + "grad_norm": 0.5392615341333712, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 9908 + }, + { + "epoch": 0.09909, + "grad_norm": 0.5811455319662365, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 9909 + }, + { + "epoch": 0.0991, + "grad_norm": 0.6707319041356397, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 9910 + }, + { + "epoch": 0.09911, + "grad_norm": 0.771243416598569, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 9911 + }, + { + "epoch": 0.09912, + "grad_norm": 0.8496996867713535, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 9912 + }, + { + "epoch": 0.09913, + "grad_norm": 0.8902788189271055, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 9913 + }, + { + "epoch": 0.09914, + "grad_norm": 0.9357066721293495, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 9914 + }, + { + "epoch": 0.09915, + "grad_norm": 0.9755645470634543, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 9915 + }, + { + "epoch": 0.09916, + "grad_norm": 1.0089094696696197, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 9916 + }, + { + "epoch": 0.09917, + "grad_norm": 0.8817642996354145, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 9917 + }, + { + "epoch": 0.09918, + "grad_norm": 0.7658568526663542, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 9918 + }, + { + "epoch": 0.09919, + "grad_norm": 0.8128521592081074, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 9919 + }, + { + "epoch": 0.0992, + "grad_norm": 0.8609166031572244, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 9920 + }, + { + "epoch": 0.09921, + "grad_norm": 1.0125750989658047, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 9921 + }, + { + "epoch": 0.09922, + "grad_norm": 1.0032822006219375, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 9922 + }, + { + "epoch": 0.09923, + "grad_norm": 1.2307924439733817, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 9923 + }, + { + "epoch": 0.09924, + "grad_norm": 1.0090705722158624, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 9924 + }, + { + "epoch": 0.09925, + "grad_norm": 1.0212794524503392, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 9925 + }, + { + "epoch": 0.09926, + "grad_norm": 0.9853180213898505, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 9926 + }, + { + "epoch": 0.09927, + "grad_norm": 0.758327811881651, + "learning_rate": 0.003, + "loss": 4.1349, + "step": 9927 + }, + { + "epoch": 0.09928, + "grad_norm": 0.7659641009511424, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 9928 + }, + { + "epoch": 0.09929, + "grad_norm": 0.7504067878828342, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 9929 + }, + { + "epoch": 0.0993, + "grad_norm": 0.6947364493882249, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 9930 + }, + { + "epoch": 0.09931, + "grad_norm": 0.6761909114866678, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 9931 + }, + { + "epoch": 0.09932, + "grad_norm": 0.6135941560379718, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 9932 + }, + { + "epoch": 0.09933, + "grad_norm": 0.582085876114816, + "learning_rate": 0.003, + "loss": 4.091, + "step": 9933 + }, + { + "epoch": 0.09934, + "grad_norm": 0.6180077412932585, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 9934 + }, + { + "epoch": 0.09935, + "grad_norm": 0.6156612388696766, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 9935 + }, + { + "epoch": 0.09936, + "grad_norm": 0.6398409744821787, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 9936 + }, + { + "epoch": 0.09937, + "grad_norm": 0.6544534848994807, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 9937 + }, + { + "epoch": 0.09938, + "grad_norm": 0.8269077552442352, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 9938 + }, + { + "epoch": 0.09939, + "grad_norm": 0.9971115211192825, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 9939 + }, + { + "epoch": 0.0994, + "grad_norm": 0.9857415352546822, + "learning_rate": 0.003, + "loss": 4.1229, + "step": 9940 + }, + { + "epoch": 0.09941, + "grad_norm": 0.8137608150128142, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 9941 + }, + { + "epoch": 0.09942, + "grad_norm": 0.593147335054599, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 9942 + }, + { + "epoch": 0.09943, + "grad_norm": 0.5875666526054226, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 9943 + }, + { + "epoch": 0.09944, + "grad_norm": 0.5879211483291883, + "learning_rate": 0.003, + "loss": 4.112, + "step": 9944 + }, + { + "epoch": 0.09945, + "grad_norm": 0.6905176331929537, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 9945 + }, + { + "epoch": 0.09946, + "grad_norm": 0.7198664855118773, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 9946 + }, + { + "epoch": 0.09947, + "grad_norm": 0.752440611386409, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 9947 + }, + { + "epoch": 0.09948, + "grad_norm": 0.8192932165988773, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 9948 + }, + { + "epoch": 0.09949, + "grad_norm": 0.8209749794970975, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 9949 + }, + { + "epoch": 0.0995, + "grad_norm": 0.8358143036904804, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 9950 + }, + { + "epoch": 0.09951, + "grad_norm": 0.8867621716493573, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 9951 + }, + { + "epoch": 0.09952, + "grad_norm": 0.8466716064437273, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 9952 + }, + { + "epoch": 0.09953, + "grad_norm": 0.8254486041624173, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 9953 + }, + { + "epoch": 0.09954, + "grad_norm": 0.7712441746642007, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 9954 + }, + { + "epoch": 0.09955, + "grad_norm": 0.6903202853138538, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 9955 + }, + { + "epoch": 0.09956, + "grad_norm": 0.5906670263661717, + "learning_rate": 0.003, + "loss": 4.074, + "step": 9956 + }, + { + "epoch": 0.09957, + "grad_norm": 0.5743206687874738, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 9957 + }, + { + "epoch": 0.09958, + "grad_norm": 0.590105027476533, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 9958 + }, + { + "epoch": 0.09959, + "grad_norm": 0.6478146939289141, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 9959 + }, + { + "epoch": 0.0996, + "grad_norm": 0.6125903545494685, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 9960 + }, + { + "epoch": 0.09961, + "grad_norm": 0.6855179525277352, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 9961 + }, + { + "epoch": 0.09962, + "grad_norm": 0.8245686493079379, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 9962 + }, + { + "epoch": 0.09963, + "grad_norm": 1.1718745927375416, + "learning_rate": 0.003, + "loss": 4.106, + "step": 9963 + }, + { + "epoch": 0.09964, + "grad_norm": 0.7822113890013278, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 9964 + }, + { + "epoch": 0.09965, + "grad_norm": 0.6627168370652431, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 9965 + }, + { + "epoch": 0.09966, + "grad_norm": 0.6816975050009425, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 9966 + }, + { + "epoch": 0.09967, + "grad_norm": 0.6410163787617061, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 9967 + }, + { + "epoch": 0.09968, + "grad_norm": 0.5513934862668493, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 9968 + }, + { + "epoch": 0.09969, + "grad_norm": 0.5717861304898563, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 9969 + }, + { + "epoch": 0.0997, + "grad_norm": 0.6378222450638118, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 9970 + }, + { + "epoch": 0.09971, + "grad_norm": 0.6879760044616384, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 9971 + }, + { + "epoch": 0.09972, + "grad_norm": 0.7703686261003476, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 9972 + }, + { + "epoch": 0.09973, + "grad_norm": 0.8455904106463509, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 9973 + }, + { + "epoch": 0.09974, + "grad_norm": 0.9031224948235831, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 9974 + }, + { + "epoch": 0.09975, + "grad_norm": 0.7834229298155747, + "learning_rate": 0.003, + "loss": 4.091, + "step": 9975 + }, + { + "epoch": 0.09976, + "grad_norm": 0.7374608282517201, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 9976 + }, + { + "epoch": 0.09977, + "grad_norm": 0.6475022518581292, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 9977 + }, + { + "epoch": 0.09978, + "grad_norm": 0.6655914091026776, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 9978 + }, + { + "epoch": 0.09979, + "grad_norm": 0.6092837390948476, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 9979 + }, + { + "epoch": 0.0998, + "grad_norm": 0.6222297729643492, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 9980 + }, + { + "epoch": 0.09981, + "grad_norm": 0.5644346622119025, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 9981 + }, + { + "epoch": 0.09982, + "grad_norm": 0.5746206355689046, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 9982 + }, + { + "epoch": 0.09983, + "grad_norm": 0.6579561700974546, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 9983 + }, + { + "epoch": 0.09984, + "grad_norm": 0.9489942242805092, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 9984 + }, + { + "epoch": 0.09985, + "grad_norm": 1.5512495843402707, + "learning_rate": 0.003, + "loss": 4.1121, + "step": 9985 + }, + { + "epoch": 0.09986, + "grad_norm": 0.5642219821127367, + "learning_rate": 0.003, + "loss": 4.096, + "step": 9986 + }, + { + "epoch": 0.09987, + "grad_norm": 0.8878804991370798, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 9987 + }, + { + "epoch": 0.09988, + "grad_norm": 0.9616073363226949, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 9988 + }, + { + "epoch": 0.09989, + "grad_norm": 0.905824845539826, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 9989 + }, + { + "epoch": 0.0999, + "grad_norm": 0.8647017499059604, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 9990 + }, + { + "epoch": 0.09991, + "grad_norm": 0.8032896275364889, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 9991 + }, + { + "epoch": 0.09992, + "grad_norm": 0.7279383274198165, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 9992 + }, + { + "epoch": 0.09993, + "grad_norm": 0.7799466970112703, + "learning_rate": 0.003, + "loss": 4.085, + "step": 9993 + }, + { + "epoch": 0.09994, + "grad_norm": 0.9145234797734555, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 9994 + }, + { + "epoch": 0.09995, + "grad_norm": 0.8201103396095457, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 9995 + }, + { + "epoch": 0.09996, + "grad_norm": 0.9047552292555922, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 9996 + }, + { + "epoch": 0.09997, + "grad_norm": 0.8963430475859631, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 9997 + }, + { + "epoch": 0.09998, + "grad_norm": 0.813945518877336, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 9998 + }, + { + "epoch": 0.09999, + "grad_norm": 0.6549507314006496, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 9999 + }, + { + "epoch": 0.1, + "grad_norm": 0.6519137253369942, + "learning_rate": 0.003, + "loss": 4.097, + "step": 10000 + }, + { + "epoch": 0.10001, + "grad_norm": 0.6310538887421997, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 10001 + }, + { + "epoch": 0.10002, + "grad_norm": 0.6391259755212524, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 10002 + }, + { + "epoch": 0.10003, + "grad_norm": 0.5978190100029864, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 10003 + }, + { + "epoch": 0.10004, + "grad_norm": 0.7353677870344891, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 10004 + }, + { + "epoch": 0.10005, + "grad_norm": 0.8600701195416798, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 10005 + }, + { + "epoch": 0.10006, + "grad_norm": 0.7651339491329487, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 10006 + }, + { + "epoch": 0.10007, + "grad_norm": 0.8281866504989451, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 10007 + }, + { + "epoch": 0.10008, + "grad_norm": 0.898976147912649, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 10008 + }, + { + "epoch": 0.10009, + "grad_norm": 0.9235345887520816, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 10009 + }, + { + "epoch": 0.1001, + "grad_norm": 1.1045884205459653, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 10010 + }, + { + "epoch": 0.10011, + "grad_norm": 0.8047977290401939, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 10011 + }, + { + "epoch": 0.10012, + "grad_norm": 0.8491840634842334, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 10012 + }, + { + "epoch": 0.10013, + "grad_norm": 1.0828080137474874, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 10013 + }, + { + "epoch": 0.10014, + "grad_norm": 1.1635619875301892, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 10014 + }, + { + "epoch": 0.10015, + "grad_norm": 0.897354796776556, + "learning_rate": 0.003, + "loss": 4.1283, + "step": 10015 + }, + { + "epoch": 0.10016, + "grad_norm": 0.7780665867165246, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 10016 + }, + { + "epoch": 0.10017, + "grad_norm": 0.683817569961156, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 10017 + }, + { + "epoch": 0.10018, + "grad_norm": 0.7740532824418541, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 10018 + }, + { + "epoch": 0.10019, + "grad_norm": 0.8606450031261723, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 10019 + }, + { + "epoch": 0.1002, + "grad_norm": 0.808735430817491, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 10020 + }, + { + "epoch": 0.10021, + "grad_norm": 0.6620052775434282, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 10021 + }, + { + "epoch": 0.10022, + "grad_norm": 0.6154822144201489, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 10022 + }, + { + "epoch": 0.10023, + "grad_norm": 0.5187491125035918, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 10023 + }, + { + "epoch": 0.10024, + "grad_norm": 0.4942144582638463, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 10024 + }, + { + "epoch": 0.10025, + "grad_norm": 0.46654780612751734, + "learning_rate": 0.003, + "loss": 4.085, + "step": 10025 + }, + { + "epoch": 0.10026, + "grad_norm": 0.5004543480891657, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 10026 + }, + { + "epoch": 0.10027, + "grad_norm": 0.5513832529483311, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 10027 + }, + { + "epoch": 0.10028, + "grad_norm": 0.6352605183788917, + "learning_rate": 0.003, + "loss": 4.1138, + "step": 10028 + }, + { + "epoch": 0.10029, + "grad_norm": 0.8367870123940754, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 10029 + }, + { + "epoch": 0.1003, + "grad_norm": 1.0581826188086585, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 10030 + }, + { + "epoch": 0.10031, + "grad_norm": 1.0220085066989417, + "learning_rate": 0.003, + "loss": 4.1279, + "step": 10031 + }, + { + "epoch": 0.10032, + "grad_norm": 0.7783369834157056, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 10032 + }, + { + "epoch": 0.10033, + "grad_norm": 0.6709417988465439, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 10033 + }, + { + "epoch": 0.10034, + "grad_norm": 0.6669434095115762, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 10034 + }, + { + "epoch": 0.10035, + "grad_norm": 0.6147566105234249, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 10035 + }, + { + "epoch": 0.10036, + "grad_norm": 0.6529090399469691, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 10036 + }, + { + "epoch": 0.10037, + "grad_norm": 0.7107482522994412, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 10037 + }, + { + "epoch": 0.10038, + "grad_norm": 0.8197362150703974, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 10038 + }, + { + "epoch": 0.10039, + "grad_norm": 0.8295877869862686, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 10039 + }, + { + "epoch": 0.1004, + "grad_norm": 0.7374421686069658, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 10040 + }, + { + "epoch": 0.10041, + "grad_norm": 0.8214317115707387, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 10041 + }, + { + "epoch": 0.10042, + "grad_norm": 0.895440358479686, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 10042 + }, + { + "epoch": 0.10043, + "grad_norm": 0.9993692769809046, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 10043 + }, + { + "epoch": 0.10044, + "grad_norm": 1.072699158101983, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 10044 + }, + { + "epoch": 0.10045, + "grad_norm": 0.9388431886106355, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 10045 + }, + { + "epoch": 0.10046, + "grad_norm": 1.030884690149999, + "learning_rate": 0.003, + "loss": 4.114, + "step": 10046 + }, + { + "epoch": 0.10047, + "grad_norm": 1.0068468868079046, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 10047 + }, + { + "epoch": 0.10048, + "grad_norm": 0.7705600866643636, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 10048 + }, + { + "epoch": 0.10049, + "grad_norm": 0.8194123489368415, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 10049 + }, + { + "epoch": 0.1005, + "grad_norm": 0.869020771465238, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 10050 + }, + { + "epoch": 0.10051, + "grad_norm": 0.9307225832163902, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 10051 + }, + { + "epoch": 0.10052, + "grad_norm": 0.8621096104565541, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 10052 + }, + { + "epoch": 0.10053, + "grad_norm": 0.7248964302923027, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 10053 + }, + { + "epoch": 0.10054, + "grad_norm": 0.6696966942680252, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 10054 + }, + { + "epoch": 0.10055, + "grad_norm": 0.7706194800493945, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 10055 + }, + { + "epoch": 0.10056, + "grad_norm": 0.7476737464111006, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 10056 + }, + { + "epoch": 0.10057, + "grad_norm": 0.7734457893757593, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 10057 + }, + { + "epoch": 0.10058, + "grad_norm": 0.678252649945835, + "learning_rate": 0.003, + "loss": 4.096, + "step": 10058 + }, + { + "epoch": 0.10059, + "grad_norm": 0.6595095553472218, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 10059 + }, + { + "epoch": 0.1006, + "grad_norm": 0.7002337222649138, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 10060 + }, + { + "epoch": 0.10061, + "grad_norm": 0.7739912428589064, + "learning_rate": 0.003, + "loss": 4.072, + "step": 10061 + }, + { + "epoch": 0.10062, + "grad_norm": 0.8094580611217377, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 10062 + }, + { + "epoch": 0.10063, + "grad_norm": 0.7849235571724628, + "learning_rate": 0.003, + "loss": 4.074, + "step": 10063 + }, + { + "epoch": 0.10064, + "grad_norm": 0.71688513935201, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 10064 + }, + { + "epoch": 0.10065, + "grad_norm": 0.5387402250397292, + "learning_rate": 0.003, + "loss": 4.103, + "step": 10065 + }, + { + "epoch": 0.10066, + "grad_norm": 0.5738573805956311, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 10066 + }, + { + "epoch": 0.10067, + "grad_norm": 0.5128916053318231, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 10067 + }, + { + "epoch": 0.10068, + "grad_norm": 0.5303861773665701, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 10068 + }, + { + "epoch": 0.10069, + "grad_norm": 0.5363307400406016, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 10069 + }, + { + "epoch": 0.1007, + "grad_norm": 0.6677766567970822, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 10070 + }, + { + "epoch": 0.10071, + "grad_norm": 0.869800192886798, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 10071 + }, + { + "epoch": 0.10072, + "grad_norm": 1.241986467031261, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 10072 + }, + { + "epoch": 0.10073, + "grad_norm": 0.7044633440415046, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 10073 + }, + { + "epoch": 0.10074, + "grad_norm": 0.6016954396918823, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 10074 + }, + { + "epoch": 0.10075, + "grad_norm": 0.6506653532205113, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 10075 + }, + { + "epoch": 0.10076, + "grad_norm": 0.5913298031674521, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 10076 + }, + { + "epoch": 0.10077, + "grad_norm": 0.6612064518297085, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 10077 + }, + { + "epoch": 0.10078, + "grad_norm": 0.7381204773241105, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 10078 + }, + { + "epoch": 0.10079, + "grad_norm": 0.9136392204855105, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 10079 + }, + { + "epoch": 0.1008, + "grad_norm": 1.0551033651714792, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 10080 + }, + { + "epoch": 0.10081, + "grad_norm": 1.021241306196213, + "learning_rate": 0.003, + "loss": 4.1138, + "step": 10081 + }, + { + "epoch": 0.10082, + "grad_norm": 0.9444310820937741, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 10082 + }, + { + "epoch": 0.10083, + "grad_norm": 0.8028394635763569, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 10083 + }, + { + "epoch": 0.10084, + "grad_norm": 0.6931291147221774, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 10084 + }, + { + "epoch": 0.10085, + "grad_norm": 0.6614930979877484, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 10085 + }, + { + "epoch": 0.10086, + "grad_norm": 0.6447028805806914, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 10086 + }, + { + "epoch": 0.10087, + "grad_norm": 0.7271300834569303, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 10087 + }, + { + "epoch": 0.10088, + "grad_norm": 0.8548799986148488, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 10088 + }, + { + "epoch": 0.10089, + "grad_norm": 0.8660557367235694, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 10089 + }, + { + "epoch": 0.1009, + "grad_norm": 0.8905488162446568, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 10090 + }, + { + "epoch": 0.10091, + "grad_norm": 0.9129923392768977, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 10091 + }, + { + "epoch": 0.10092, + "grad_norm": 0.9486091293972858, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 10092 + }, + { + "epoch": 0.10093, + "grad_norm": 0.9202478176786889, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 10093 + }, + { + "epoch": 0.10094, + "grad_norm": 0.9444438231215719, + "learning_rate": 0.003, + "loss": 4.084, + "step": 10094 + }, + { + "epoch": 0.10095, + "grad_norm": 1.0023114763189855, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 10095 + }, + { + "epoch": 0.10096, + "grad_norm": 0.9008830005116453, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 10096 + }, + { + "epoch": 0.10097, + "grad_norm": 0.7394970221020885, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 10097 + }, + { + "epoch": 0.10098, + "grad_norm": 0.7119640016754663, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 10098 + }, + { + "epoch": 0.10099, + "grad_norm": 0.6795956150432717, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 10099 + }, + { + "epoch": 0.101, + "grad_norm": 0.7985963421953124, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 10100 + }, + { + "epoch": 0.10101, + "grad_norm": 0.9676465329947534, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 10101 + }, + { + "epoch": 0.10102, + "grad_norm": 0.9402257720277828, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 10102 + }, + { + "epoch": 0.10103, + "grad_norm": 0.8874925029060675, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 10103 + }, + { + "epoch": 0.10104, + "grad_norm": 0.9117348708084444, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 10104 + }, + { + "epoch": 0.10105, + "grad_norm": 0.8479646908158324, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 10105 + }, + { + "epoch": 0.10106, + "grad_norm": 0.896593809928579, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 10106 + }, + { + "epoch": 0.10107, + "grad_norm": 0.8315621037343224, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 10107 + }, + { + "epoch": 0.10108, + "grad_norm": 0.7681718745034675, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 10108 + }, + { + "epoch": 0.10109, + "grad_norm": 0.8233892495101346, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 10109 + }, + { + "epoch": 0.1011, + "grad_norm": 0.9016640807163642, + "learning_rate": 0.003, + "loss": 4.1142, + "step": 10110 + }, + { + "epoch": 0.10111, + "grad_norm": 0.9170672273091978, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 10111 + }, + { + "epoch": 0.10112, + "grad_norm": 0.8351702169958964, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 10112 + }, + { + "epoch": 0.10113, + "grad_norm": 0.8089461545174494, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 10113 + }, + { + "epoch": 0.10114, + "grad_norm": 0.9107383212481057, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 10114 + }, + { + "epoch": 0.10115, + "grad_norm": 0.8118768018834102, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 10115 + }, + { + "epoch": 0.10116, + "grad_norm": 0.7983571219601531, + "learning_rate": 0.003, + "loss": 4.091, + "step": 10116 + }, + { + "epoch": 0.10117, + "grad_norm": 0.7951177221933025, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 10117 + }, + { + "epoch": 0.10118, + "grad_norm": 0.8413030315926632, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 10118 + }, + { + "epoch": 0.10119, + "grad_norm": 0.716518761890454, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 10119 + }, + { + "epoch": 0.1012, + "grad_norm": 0.6486824283257824, + "learning_rate": 0.003, + "loss": 4.074, + "step": 10120 + }, + { + "epoch": 0.10121, + "grad_norm": 0.5880476149618893, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 10121 + }, + { + "epoch": 0.10122, + "grad_norm": 0.5866107750288472, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 10122 + }, + { + "epoch": 0.10123, + "grad_norm": 0.5811881253993841, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 10123 + }, + { + "epoch": 0.10124, + "grad_norm": 0.5569420366913549, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 10124 + }, + { + "epoch": 0.10125, + "grad_norm": 0.5942996797715101, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 10125 + }, + { + "epoch": 0.10126, + "grad_norm": 0.5670677013779793, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 10126 + }, + { + "epoch": 0.10127, + "grad_norm": 0.529089784800278, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 10127 + }, + { + "epoch": 0.10128, + "grad_norm": 0.4168566029164349, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 10128 + }, + { + "epoch": 0.10129, + "grad_norm": 0.4311731795710161, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 10129 + }, + { + "epoch": 0.1013, + "grad_norm": 0.4477041043125615, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 10130 + }, + { + "epoch": 0.10131, + "grad_norm": 0.5027821507640442, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 10131 + }, + { + "epoch": 0.10132, + "grad_norm": 0.5493559629395345, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 10132 + }, + { + "epoch": 0.10133, + "grad_norm": 0.6452629987298875, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 10133 + }, + { + "epoch": 0.10134, + "grad_norm": 0.8688443183887481, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 10134 + }, + { + "epoch": 0.10135, + "grad_norm": 1.1410657478991406, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 10135 + }, + { + "epoch": 0.10136, + "grad_norm": 0.923735834137093, + "learning_rate": 0.003, + "loss": 4.062, + "step": 10136 + }, + { + "epoch": 0.10137, + "grad_norm": 0.7985668392366627, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 10137 + }, + { + "epoch": 0.10138, + "grad_norm": 0.6893008020293587, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 10138 + }, + { + "epoch": 0.10139, + "grad_norm": 0.699027589097982, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 10139 + }, + { + "epoch": 0.1014, + "grad_norm": 0.7173481888193998, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 10140 + }, + { + "epoch": 0.10141, + "grad_norm": 0.6959173004694686, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 10141 + }, + { + "epoch": 0.10142, + "grad_norm": 0.813260923915381, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 10142 + }, + { + "epoch": 0.10143, + "grad_norm": 0.8889759915027781, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 10143 + }, + { + "epoch": 0.10144, + "grad_norm": 0.6930759800585614, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 10144 + }, + { + "epoch": 0.10145, + "grad_norm": 0.7031416657156874, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 10145 + }, + { + "epoch": 0.10146, + "grad_norm": 0.8186555112243362, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 10146 + }, + { + "epoch": 0.10147, + "grad_norm": 0.972515461562128, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 10147 + }, + { + "epoch": 0.10148, + "grad_norm": 1.2251324017059566, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 10148 + }, + { + "epoch": 0.10149, + "grad_norm": 0.7575767344901424, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 10149 + }, + { + "epoch": 0.1015, + "grad_norm": 0.7257739095105269, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 10150 + }, + { + "epoch": 0.10151, + "grad_norm": 0.6850837341823302, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 10151 + }, + { + "epoch": 0.10152, + "grad_norm": 0.6901837020889886, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 10152 + }, + { + "epoch": 0.10153, + "grad_norm": 0.7366995404122453, + "learning_rate": 0.003, + "loss": 4.089, + "step": 10153 + }, + { + "epoch": 0.10154, + "grad_norm": 0.7729200614601484, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 10154 + }, + { + "epoch": 0.10155, + "grad_norm": 0.9179351051298269, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 10155 + }, + { + "epoch": 0.10156, + "grad_norm": 1.079772577207657, + "learning_rate": 0.003, + "loss": 4.078, + "step": 10156 + }, + { + "epoch": 0.10157, + "grad_norm": 0.8478718907398735, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 10157 + }, + { + "epoch": 0.10158, + "grad_norm": 0.8076358725971214, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 10158 + }, + { + "epoch": 0.10159, + "grad_norm": 0.7182692438467507, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 10159 + }, + { + "epoch": 0.1016, + "grad_norm": 0.6354632732608569, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 10160 + }, + { + "epoch": 0.10161, + "grad_norm": 0.6096925827913215, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 10161 + }, + { + "epoch": 0.10162, + "grad_norm": 0.6412613510772903, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 10162 + }, + { + "epoch": 0.10163, + "grad_norm": 0.7228497893130814, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 10163 + }, + { + "epoch": 0.10164, + "grad_norm": 0.8189338574542239, + "learning_rate": 0.003, + "loss": 4.107, + "step": 10164 + }, + { + "epoch": 0.10165, + "grad_norm": 0.8802770642728063, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 10165 + }, + { + "epoch": 0.10166, + "grad_norm": 0.9340783981991572, + "learning_rate": 0.003, + "loss": 4.102, + "step": 10166 + }, + { + "epoch": 0.10167, + "grad_norm": 1.0522661488882454, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 10167 + }, + { + "epoch": 0.10168, + "grad_norm": 1.0883984803163758, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 10168 + }, + { + "epoch": 0.10169, + "grad_norm": 0.8718130916305842, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 10169 + }, + { + "epoch": 0.1017, + "grad_norm": 0.7267906765452536, + "learning_rate": 0.003, + "loss": 4.09, + "step": 10170 + }, + { + "epoch": 0.10171, + "grad_norm": 0.6741776316878704, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 10171 + }, + { + "epoch": 0.10172, + "grad_norm": 0.6366479102607301, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 10172 + }, + { + "epoch": 0.10173, + "grad_norm": 0.6255451070036055, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 10173 + }, + { + "epoch": 0.10174, + "grad_norm": 0.6833151370840097, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 10174 + }, + { + "epoch": 0.10175, + "grad_norm": 0.6980219428980601, + "learning_rate": 0.003, + "loss": 4.073, + "step": 10175 + }, + { + "epoch": 0.10176, + "grad_norm": 0.7375496501515941, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 10176 + }, + { + "epoch": 0.10177, + "grad_norm": 0.7898802215772395, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 10177 + }, + { + "epoch": 0.10178, + "grad_norm": 0.9839723040732539, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 10178 + }, + { + "epoch": 0.10179, + "grad_norm": 1.0558006196476055, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 10179 + }, + { + "epoch": 0.1018, + "grad_norm": 0.8839567388022763, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 10180 + }, + { + "epoch": 0.10181, + "grad_norm": 0.8323042106193205, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 10181 + }, + { + "epoch": 0.10182, + "grad_norm": 0.8791136712281088, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 10182 + }, + { + "epoch": 0.10183, + "grad_norm": 1.0917668790659707, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 10183 + }, + { + "epoch": 0.10184, + "grad_norm": 0.9025576624348437, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 10184 + }, + { + "epoch": 0.10185, + "grad_norm": 0.7798371617092108, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 10185 + }, + { + "epoch": 0.10186, + "grad_norm": 0.7713060566227249, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 10186 + }, + { + "epoch": 0.10187, + "grad_norm": 0.7563056494272968, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 10187 + }, + { + "epoch": 0.10188, + "grad_norm": 0.7772228424671542, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 10188 + }, + { + "epoch": 0.10189, + "grad_norm": 0.8260486212304557, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 10189 + }, + { + "epoch": 0.1019, + "grad_norm": 1.1171480943910872, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 10190 + }, + { + "epoch": 0.10191, + "grad_norm": 0.9843614175223876, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 10191 + }, + { + "epoch": 0.10192, + "grad_norm": 1.036663948755249, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 10192 + }, + { + "epoch": 0.10193, + "grad_norm": 0.8972801457141254, + "learning_rate": 0.003, + "loss": 4.097, + "step": 10193 + }, + { + "epoch": 0.10194, + "grad_norm": 0.8235311205411696, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 10194 + }, + { + "epoch": 0.10195, + "grad_norm": 0.709463331336577, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 10195 + }, + { + "epoch": 0.10196, + "grad_norm": 0.5235487875342617, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 10196 + }, + { + "epoch": 0.10197, + "grad_norm": 0.5246146886760584, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 10197 + }, + { + "epoch": 0.10198, + "grad_norm": 0.5530686360133628, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 10198 + }, + { + "epoch": 0.10199, + "grad_norm": 0.6533458323226683, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 10199 + }, + { + "epoch": 0.102, + "grad_norm": 0.8138884437121796, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 10200 + }, + { + "epoch": 0.10201, + "grad_norm": 0.8833147158811808, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 10201 + }, + { + "epoch": 0.10202, + "grad_norm": 0.8602495065910606, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 10202 + }, + { + "epoch": 0.10203, + "grad_norm": 0.8167686182900835, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 10203 + }, + { + "epoch": 0.10204, + "grad_norm": 0.69579025866685, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 10204 + }, + { + "epoch": 0.10205, + "grad_norm": 0.6952695693816464, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 10205 + }, + { + "epoch": 0.10206, + "grad_norm": 0.7178836748410159, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 10206 + }, + { + "epoch": 0.10207, + "grad_norm": 0.7051292005693832, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 10207 + }, + { + "epoch": 0.10208, + "grad_norm": 0.7123121092039982, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 10208 + }, + { + "epoch": 0.10209, + "grad_norm": 0.7357546073327017, + "learning_rate": 0.003, + "loss": 4.097, + "step": 10209 + }, + { + "epoch": 0.1021, + "grad_norm": 0.738960921368254, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 10210 + }, + { + "epoch": 0.10211, + "grad_norm": 0.9042302755628594, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 10211 + }, + { + "epoch": 0.10212, + "grad_norm": 0.9328835687364458, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 10212 + }, + { + "epoch": 0.10213, + "grad_norm": 1.064128555829592, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 10213 + }, + { + "epoch": 0.10214, + "grad_norm": 0.8423985032180068, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 10214 + }, + { + "epoch": 0.10215, + "grad_norm": 0.6894767482193619, + "learning_rate": 0.003, + "loss": 4.107, + "step": 10215 + }, + { + "epoch": 0.10216, + "grad_norm": 0.6407174824026426, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 10216 + }, + { + "epoch": 0.10217, + "grad_norm": 0.6930631872988513, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 10217 + }, + { + "epoch": 0.10218, + "grad_norm": 0.6736003865724515, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 10218 + }, + { + "epoch": 0.10219, + "grad_norm": 0.6391476717729888, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 10219 + }, + { + "epoch": 0.1022, + "grad_norm": 0.626075339436956, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 10220 + }, + { + "epoch": 0.10221, + "grad_norm": 0.6109786482845987, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 10221 + }, + { + "epoch": 0.10222, + "grad_norm": 0.6282338053842801, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 10222 + }, + { + "epoch": 0.10223, + "grad_norm": 0.6832246627842549, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 10223 + }, + { + "epoch": 0.10224, + "grad_norm": 0.7169113704064414, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 10224 + }, + { + "epoch": 0.10225, + "grad_norm": 0.7323081942187616, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 10225 + }, + { + "epoch": 0.10226, + "grad_norm": 0.7406008465439835, + "learning_rate": 0.003, + "loss": 4.1302, + "step": 10226 + }, + { + "epoch": 0.10227, + "grad_norm": 0.785218333980764, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 10227 + }, + { + "epoch": 0.10228, + "grad_norm": 0.9898486890834313, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 10228 + }, + { + "epoch": 0.10229, + "grad_norm": 1.0742073532465095, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 10229 + }, + { + "epoch": 0.1023, + "grad_norm": 0.7265290601522981, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 10230 + }, + { + "epoch": 0.10231, + "grad_norm": 0.7367427007561755, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 10231 + }, + { + "epoch": 0.10232, + "grad_norm": 0.7481518210705496, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 10232 + }, + { + "epoch": 0.10233, + "grad_norm": 0.8005412014419764, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 10233 + }, + { + "epoch": 0.10234, + "grad_norm": 0.8303364545297103, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 10234 + }, + { + "epoch": 0.10235, + "grad_norm": 0.8310317993635913, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 10235 + }, + { + "epoch": 0.10236, + "grad_norm": 0.9054391176791641, + "learning_rate": 0.003, + "loss": 4.127, + "step": 10236 + }, + { + "epoch": 0.10237, + "grad_norm": 0.8435763363330704, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 10237 + }, + { + "epoch": 0.10238, + "grad_norm": 0.8115475750168918, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 10238 + }, + { + "epoch": 0.10239, + "grad_norm": 0.8628909482258778, + "learning_rate": 0.003, + "loss": 4.1279, + "step": 10239 + }, + { + "epoch": 0.1024, + "grad_norm": 0.9364777145567061, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 10240 + }, + { + "epoch": 0.10241, + "grad_norm": 1.146465968559167, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 10241 + }, + { + "epoch": 0.10242, + "grad_norm": 0.854898755875994, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 10242 + }, + { + "epoch": 0.10243, + "grad_norm": 0.7511718291044952, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 10243 + }, + { + "epoch": 0.10244, + "grad_norm": 0.7603646485366432, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 10244 + }, + { + "epoch": 0.10245, + "grad_norm": 0.8105837774316208, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 10245 + }, + { + "epoch": 0.10246, + "grad_norm": 0.7977585495767636, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 10246 + }, + { + "epoch": 0.10247, + "grad_norm": 0.7506978997504432, + "learning_rate": 0.003, + "loss": 4.1188, + "step": 10247 + }, + { + "epoch": 0.10248, + "grad_norm": 0.8759386294051426, + "learning_rate": 0.003, + "loss": 4.115, + "step": 10248 + }, + { + "epoch": 0.10249, + "grad_norm": 0.8431799228218545, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 10249 + }, + { + "epoch": 0.1025, + "grad_norm": 0.7864842623687947, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 10250 + }, + { + "epoch": 0.10251, + "grad_norm": 0.7740703845382346, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 10251 + }, + { + "epoch": 0.10252, + "grad_norm": 0.8321898093447786, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 10252 + }, + { + "epoch": 0.10253, + "grad_norm": 0.7894410369300876, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 10253 + }, + { + "epoch": 0.10254, + "grad_norm": 0.8133697650165276, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 10254 + }, + { + "epoch": 0.10255, + "grad_norm": 0.744829951039707, + "learning_rate": 0.003, + "loss": 4.082, + "step": 10255 + }, + { + "epoch": 0.10256, + "grad_norm": 0.7780534762171264, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 10256 + }, + { + "epoch": 0.10257, + "grad_norm": 0.8808046453133321, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 10257 + }, + { + "epoch": 0.10258, + "grad_norm": 0.9129780513694674, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 10258 + }, + { + "epoch": 0.10259, + "grad_norm": 0.8386788452118967, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 10259 + }, + { + "epoch": 0.1026, + "grad_norm": 0.7688367051425977, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 10260 + }, + { + "epoch": 0.10261, + "grad_norm": 0.6828172453485549, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 10261 + }, + { + "epoch": 0.10262, + "grad_norm": 0.6027470278842928, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 10262 + }, + { + "epoch": 0.10263, + "grad_norm": 0.6134625435010462, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 10263 + }, + { + "epoch": 0.10264, + "grad_norm": 0.6601613262512446, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 10264 + }, + { + "epoch": 0.10265, + "grad_norm": 0.6925742701606704, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 10265 + }, + { + "epoch": 0.10266, + "grad_norm": 0.7494871872755563, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 10266 + }, + { + "epoch": 0.10267, + "grad_norm": 0.7626957921447522, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 10267 + }, + { + "epoch": 0.10268, + "grad_norm": 0.8706405052637655, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 10268 + }, + { + "epoch": 0.10269, + "grad_norm": 0.9926896776195115, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 10269 + }, + { + "epoch": 0.1027, + "grad_norm": 0.9916626731747161, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 10270 + }, + { + "epoch": 0.10271, + "grad_norm": 0.96407920565692, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 10271 + }, + { + "epoch": 0.10272, + "grad_norm": 0.8960155500010829, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 10272 + }, + { + "epoch": 0.10273, + "grad_norm": 0.7192574545668858, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 10273 + }, + { + "epoch": 0.10274, + "grad_norm": 0.7829507156964814, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 10274 + }, + { + "epoch": 0.10275, + "grad_norm": 0.8067743579386221, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 10275 + }, + { + "epoch": 0.10276, + "grad_norm": 0.9012713038007449, + "learning_rate": 0.003, + "loss": 4.095, + "step": 10276 + }, + { + "epoch": 0.10277, + "grad_norm": 1.0017053952207466, + "learning_rate": 0.003, + "loss": 4.087, + "step": 10277 + }, + { + "epoch": 0.10278, + "grad_norm": 1.1196366403325528, + "learning_rate": 0.003, + "loss": 4.1008, + "step": 10278 + }, + { + "epoch": 0.10279, + "grad_norm": 0.7036241002443819, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 10279 + }, + { + "epoch": 0.1028, + "grad_norm": 0.675873333438298, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 10280 + }, + { + "epoch": 0.10281, + "grad_norm": 0.7188694294424975, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 10281 + }, + { + "epoch": 0.10282, + "grad_norm": 0.6458206776287501, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 10282 + }, + { + "epoch": 0.10283, + "grad_norm": 0.6503790584246892, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 10283 + }, + { + "epoch": 0.10284, + "grad_norm": 0.7512150394872587, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 10284 + }, + { + "epoch": 0.10285, + "grad_norm": 0.8041418564413038, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 10285 + }, + { + "epoch": 0.10286, + "grad_norm": 0.7393185584957023, + "learning_rate": 0.003, + "loss": 4.085, + "step": 10286 + }, + { + "epoch": 0.10287, + "grad_norm": 0.6536497052383766, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 10287 + }, + { + "epoch": 0.10288, + "grad_norm": 0.6895525064317385, + "learning_rate": 0.003, + "loss": 4.1179, + "step": 10288 + }, + { + "epoch": 0.10289, + "grad_norm": 0.71141474436633, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 10289 + }, + { + "epoch": 0.1029, + "grad_norm": 0.7863696654596326, + "learning_rate": 0.003, + "loss": 4.1034, + "step": 10290 + }, + { + "epoch": 0.10291, + "grad_norm": 0.7433309300210272, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 10291 + }, + { + "epoch": 0.10292, + "grad_norm": 0.7228574492515069, + "learning_rate": 0.003, + "loss": 4.1068, + "step": 10292 + }, + { + "epoch": 0.10293, + "grad_norm": 0.7328543775889537, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 10293 + }, + { + "epoch": 0.10294, + "grad_norm": 0.7119459365331811, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 10294 + }, + { + "epoch": 0.10295, + "grad_norm": 0.6902154836264375, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 10295 + }, + { + "epoch": 0.10296, + "grad_norm": 0.6673321189400658, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 10296 + }, + { + "epoch": 0.10297, + "grad_norm": 0.6206236886964629, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 10297 + }, + { + "epoch": 0.10298, + "grad_norm": 0.580549324398935, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 10298 + }, + { + "epoch": 0.10299, + "grad_norm": 0.5568108267250715, + "learning_rate": 0.003, + "loss": 4.074, + "step": 10299 + }, + { + "epoch": 0.103, + "grad_norm": 0.6459850251191824, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 10300 + }, + { + "epoch": 0.10301, + "grad_norm": 0.7870413569880143, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 10301 + }, + { + "epoch": 0.10302, + "grad_norm": 0.9424107245640844, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 10302 + }, + { + "epoch": 0.10303, + "grad_norm": 1.0280411938586382, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 10303 + }, + { + "epoch": 0.10304, + "grad_norm": 1.1122147913714624, + "learning_rate": 0.003, + "loss": 4.105, + "step": 10304 + }, + { + "epoch": 0.10305, + "grad_norm": 0.935437875996847, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 10305 + }, + { + "epoch": 0.10306, + "grad_norm": 0.9434216723466147, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 10306 + }, + { + "epoch": 0.10307, + "grad_norm": 1.077419206259069, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 10307 + }, + { + "epoch": 0.10308, + "grad_norm": 0.9260799956690462, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 10308 + }, + { + "epoch": 0.10309, + "grad_norm": 0.788475483287031, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 10309 + }, + { + "epoch": 0.1031, + "grad_norm": 0.7548519322564272, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 10310 + }, + { + "epoch": 0.10311, + "grad_norm": 0.8490099710145201, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 10311 + }, + { + "epoch": 0.10312, + "grad_norm": 0.9754889415393432, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 10312 + }, + { + "epoch": 0.10313, + "grad_norm": 0.9725068876707715, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 10313 + }, + { + "epoch": 0.10314, + "grad_norm": 1.008302977550403, + "learning_rate": 0.003, + "loss": 4.1094, + "step": 10314 + }, + { + "epoch": 0.10315, + "grad_norm": 1.1068450571626192, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 10315 + }, + { + "epoch": 0.10316, + "grad_norm": 1.01689260859524, + "learning_rate": 0.003, + "loss": 4.1353, + "step": 10316 + }, + { + "epoch": 0.10317, + "grad_norm": 1.0399611525497334, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 10317 + }, + { + "epoch": 0.10318, + "grad_norm": 1.0664621024838172, + "learning_rate": 0.003, + "loss": 4.092, + "step": 10318 + }, + { + "epoch": 0.10319, + "grad_norm": 0.8887351487113996, + "learning_rate": 0.003, + "loss": 4.106, + "step": 10319 + }, + { + "epoch": 0.1032, + "grad_norm": 0.9777260247175278, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 10320 + }, + { + "epoch": 0.10321, + "grad_norm": 1.0855049175533027, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 10321 + }, + { + "epoch": 0.10322, + "grad_norm": 0.963506360245415, + "learning_rate": 0.003, + "loss": 4.1389, + "step": 10322 + }, + { + "epoch": 0.10323, + "grad_norm": 0.9881045744868919, + "learning_rate": 0.003, + "loss": 4.1225, + "step": 10323 + }, + { + "epoch": 0.10324, + "grad_norm": 0.8742451143437673, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 10324 + }, + { + "epoch": 0.10325, + "grad_norm": 0.7592965201742712, + "learning_rate": 0.003, + "loss": 4.098, + "step": 10325 + }, + { + "epoch": 0.10326, + "grad_norm": 0.6924762638567085, + "learning_rate": 0.003, + "loss": 4.111, + "step": 10326 + }, + { + "epoch": 0.10327, + "grad_norm": 0.7996862828960193, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 10327 + }, + { + "epoch": 0.10328, + "grad_norm": 0.8472096443774338, + "learning_rate": 0.003, + "loss": 4.087, + "step": 10328 + }, + { + "epoch": 0.10329, + "grad_norm": 0.80863735984161, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 10329 + }, + { + "epoch": 0.1033, + "grad_norm": 0.7470728233214627, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 10330 + }, + { + "epoch": 0.10331, + "grad_norm": 0.6111385808771548, + "learning_rate": 0.003, + "loss": 4.077, + "step": 10331 + }, + { + "epoch": 0.10332, + "grad_norm": 0.5181245173636219, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 10332 + }, + { + "epoch": 0.10333, + "grad_norm": 0.5288066580362742, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 10333 + }, + { + "epoch": 0.10334, + "grad_norm": 0.5522664037256619, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 10334 + }, + { + "epoch": 0.10335, + "grad_norm": 0.6137655038412625, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 10335 + }, + { + "epoch": 0.10336, + "grad_norm": 0.7105459077980683, + "learning_rate": 0.003, + "loss": 4.1034, + "step": 10336 + }, + { + "epoch": 0.10337, + "grad_norm": 0.8033414113689806, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 10337 + }, + { + "epoch": 0.10338, + "grad_norm": 0.76450882628852, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 10338 + }, + { + "epoch": 0.10339, + "grad_norm": 0.6766193484175521, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 10339 + }, + { + "epoch": 0.1034, + "grad_norm": 0.7046081840372489, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 10340 + }, + { + "epoch": 0.10341, + "grad_norm": 0.6881196394385097, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 10341 + }, + { + "epoch": 0.10342, + "grad_norm": 0.7019620860313454, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 10342 + }, + { + "epoch": 0.10343, + "grad_norm": 0.7669457179418919, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 10343 + }, + { + "epoch": 0.10344, + "grad_norm": 0.8562986464162007, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 10344 + }, + { + "epoch": 0.10345, + "grad_norm": 0.9678576326676694, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 10345 + }, + { + "epoch": 0.10346, + "grad_norm": 0.988840553225034, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 10346 + }, + { + "epoch": 0.10347, + "grad_norm": 0.8761665279793409, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 10347 + }, + { + "epoch": 0.10348, + "grad_norm": 0.8070465433399605, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 10348 + }, + { + "epoch": 0.10349, + "grad_norm": 0.746677695053773, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 10349 + }, + { + "epoch": 0.1035, + "grad_norm": 0.7251075414951131, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 10350 + }, + { + "epoch": 0.10351, + "grad_norm": 0.682523058274381, + "learning_rate": 0.003, + "loss": 4.093, + "step": 10351 + }, + { + "epoch": 0.10352, + "grad_norm": 0.5549826583410974, + "learning_rate": 0.003, + "loss": 4.078, + "step": 10352 + }, + { + "epoch": 0.10353, + "grad_norm": 0.6184831584530538, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 10353 + }, + { + "epoch": 0.10354, + "grad_norm": 0.6154810565062979, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 10354 + }, + { + "epoch": 0.10355, + "grad_norm": 0.6013274566749466, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 10355 + }, + { + "epoch": 0.10356, + "grad_norm": 0.5790494966726321, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 10356 + }, + { + "epoch": 0.10357, + "grad_norm": 0.5815749905417382, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 10357 + }, + { + "epoch": 0.10358, + "grad_norm": 0.5961417322223677, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 10358 + }, + { + "epoch": 0.10359, + "grad_norm": 0.612228084547915, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 10359 + }, + { + "epoch": 0.1036, + "grad_norm": 0.5632606844307874, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 10360 + }, + { + "epoch": 0.10361, + "grad_norm": 0.5944178213854017, + "learning_rate": 0.003, + "loss": 4.036, + "step": 10361 + }, + { + "epoch": 0.10362, + "grad_norm": 0.6050895286964695, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 10362 + }, + { + "epoch": 0.10363, + "grad_norm": 0.5893502071647355, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 10363 + }, + { + "epoch": 0.10364, + "grad_norm": 0.6480921596779424, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 10364 + }, + { + "epoch": 0.10365, + "grad_norm": 0.6655516728542986, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 10365 + }, + { + "epoch": 0.10366, + "grad_norm": 0.841939927056256, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 10366 + }, + { + "epoch": 0.10367, + "grad_norm": 1.2925940203305464, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 10367 + }, + { + "epoch": 0.10368, + "grad_norm": 1.0721423681105018, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 10368 + }, + { + "epoch": 0.10369, + "grad_norm": 0.8054815871522366, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 10369 + }, + { + "epoch": 0.1037, + "grad_norm": 0.6431254709101457, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 10370 + }, + { + "epoch": 0.10371, + "grad_norm": 0.6771185643178579, + "learning_rate": 0.003, + "loss": 4.049, + "step": 10371 + }, + { + "epoch": 0.10372, + "grad_norm": 0.6505722328550112, + "learning_rate": 0.003, + "loss": 4.061, + "step": 10372 + }, + { + "epoch": 0.10373, + "grad_norm": 0.649150897107484, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 10373 + }, + { + "epoch": 0.10374, + "grad_norm": 0.7655192327120619, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 10374 + }, + { + "epoch": 0.10375, + "grad_norm": 0.8276458451965146, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 10375 + }, + { + "epoch": 0.10376, + "grad_norm": 0.8401543111084332, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 10376 + }, + { + "epoch": 0.10377, + "grad_norm": 0.961981488825057, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 10377 + }, + { + "epoch": 0.10378, + "grad_norm": 1.0528802151842687, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 10378 + }, + { + "epoch": 0.10379, + "grad_norm": 0.9122713415226914, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 10379 + }, + { + "epoch": 0.1038, + "grad_norm": 0.8191396318548235, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 10380 + }, + { + "epoch": 0.10381, + "grad_norm": 0.7785335876041576, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 10381 + }, + { + "epoch": 0.10382, + "grad_norm": 0.8640344513391571, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 10382 + }, + { + "epoch": 0.10383, + "grad_norm": 1.0100767424979953, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 10383 + }, + { + "epoch": 0.10384, + "grad_norm": 1.0586243617599906, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 10384 + }, + { + "epoch": 0.10385, + "grad_norm": 0.8624828769831347, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 10385 + }, + { + "epoch": 0.10386, + "grad_norm": 0.8830596935983357, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 10386 + }, + { + "epoch": 0.10387, + "grad_norm": 0.8306326059978796, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 10387 + }, + { + "epoch": 0.10388, + "grad_norm": 0.8272101216281712, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 10388 + }, + { + "epoch": 0.10389, + "grad_norm": 0.9009021769238307, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 10389 + }, + { + "epoch": 0.1039, + "grad_norm": 1.0391034226876843, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 10390 + }, + { + "epoch": 0.10391, + "grad_norm": 0.9858486051647656, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 10391 + }, + { + "epoch": 0.10392, + "grad_norm": 0.817170907122026, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 10392 + }, + { + "epoch": 0.10393, + "grad_norm": 0.7813731942805362, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 10393 + }, + { + "epoch": 0.10394, + "grad_norm": 0.7491777949113574, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 10394 + }, + { + "epoch": 0.10395, + "grad_norm": 0.9168636740102267, + "learning_rate": 0.003, + "loss": 4.1249, + "step": 10395 + }, + { + "epoch": 0.10396, + "grad_norm": 1.0881876341009609, + "learning_rate": 0.003, + "loss": 4.1147, + "step": 10396 + }, + { + "epoch": 0.10397, + "grad_norm": 0.8483418535870134, + "learning_rate": 0.003, + "loss": 4.1204, + "step": 10397 + }, + { + "epoch": 0.10398, + "grad_norm": 0.8004552390402355, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 10398 + }, + { + "epoch": 0.10399, + "grad_norm": 1.0550255797082628, + "learning_rate": 0.003, + "loss": 4.12, + "step": 10399 + }, + { + "epoch": 0.104, + "grad_norm": 1.1469901095195558, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 10400 + }, + { + "epoch": 0.10401, + "grad_norm": 0.776163770261994, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 10401 + }, + { + "epoch": 0.10402, + "grad_norm": 0.7191911818065254, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 10402 + }, + { + "epoch": 0.10403, + "grad_norm": 0.7757679538550868, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 10403 + }, + { + "epoch": 0.10404, + "grad_norm": 0.9017847369719004, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 10404 + }, + { + "epoch": 0.10405, + "grad_norm": 1.0205196229352549, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 10405 + }, + { + "epoch": 0.10406, + "grad_norm": 1.0561034007451116, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 10406 + }, + { + "epoch": 0.10407, + "grad_norm": 0.8382012374833445, + "learning_rate": 0.003, + "loss": 4.1182, + "step": 10407 + }, + { + "epoch": 0.10408, + "grad_norm": 0.6440441204977746, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 10408 + }, + { + "epoch": 0.10409, + "grad_norm": 0.6231554547472592, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 10409 + }, + { + "epoch": 0.1041, + "grad_norm": 0.5895655256675674, + "learning_rate": 0.003, + "loss": 4.1169, + "step": 10410 + }, + { + "epoch": 0.10411, + "grad_norm": 0.6078869262220562, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 10411 + }, + { + "epoch": 0.10412, + "grad_norm": 0.5714544500792731, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 10412 + }, + { + "epoch": 0.10413, + "grad_norm": 0.4888734408355746, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 10413 + }, + { + "epoch": 0.10414, + "grad_norm": 0.45177378648238387, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 10414 + }, + { + "epoch": 0.10415, + "grad_norm": 0.4464755195762157, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 10415 + }, + { + "epoch": 0.10416, + "grad_norm": 0.4119578434609014, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 10416 + }, + { + "epoch": 0.10417, + "grad_norm": 0.4530981054257146, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 10417 + }, + { + "epoch": 0.10418, + "grad_norm": 0.5625746495824158, + "learning_rate": 0.003, + "loss": 4.055, + "step": 10418 + }, + { + "epoch": 0.10419, + "grad_norm": 0.7189451960829651, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 10419 + }, + { + "epoch": 0.1042, + "grad_norm": 0.9831700376646376, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 10420 + }, + { + "epoch": 0.10421, + "grad_norm": 1.1445532836627261, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 10421 + }, + { + "epoch": 0.10422, + "grad_norm": 0.7082563767107952, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 10422 + }, + { + "epoch": 0.10423, + "grad_norm": 0.6874627848858307, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 10423 + }, + { + "epoch": 0.10424, + "grad_norm": 0.730815564511028, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 10424 + }, + { + "epoch": 0.10425, + "grad_norm": 0.6651685498466663, + "learning_rate": 0.003, + "loss": 4.077, + "step": 10425 + }, + { + "epoch": 0.10426, + "grad_norm": 0.629940706978873, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 10426 + }, + { + "epoch": 0.10427, + "grad_norm": 0.662314993288643, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 10427 + }, + { + "epoch": 0.10428, + "grad_norm": 0.7008715707921093, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 10428 + }, + { + "epoch": 0.10429, + "grad_norm": 0.774968411885718, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 10429 + }, + { + "epoch": 0.1043, + "grad_norm": 0.7183561816195599, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 10430 + }, + { + "epoch": 0.10431, + "grad_norm": 0.7603457171655063, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 10431 + }, + { + "epoch": 0.10432, + "grad_norm": 0.79045224783535, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 10432 + }, + { + "epoch": 0.10433, + "grad_norm": 0.9599162387244943, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 10433 + }, + { + "epoch": 0.10434, + "grad_norm": 0.8214963345150575, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 10434 + }, + { + "epoch": 0.10435, + "grad_norm": 0.7767373236471652, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 10435 + }, + { + "epoch": 0.10436, + "grad_norm": 0.7865430681912782, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 10436 + }, + { + "epoch": 0.10437, + "grad_norm": 0.7933364160947659, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 10437 + }, + { + "epoch": 0.10438, + "grad_norm": 0.7277652813568619, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 10438 + }, + { + "epoch": 0.10439, + "grad_norm": 0.6839207059096676, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 10439 + }, + { + "epoch": 0.1044, + "grad_norm": 0.6746530544011542, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 10440 + }, + { + "epoch": 0.10441, + "grad_norm": 0.7864041796225456, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 10441 + }, + { + "epoch": 0.10442, + "grad_norm": 0.8804608511981953, + "learning_rate": 0.003, + "loss": 4.069, + "step": 10442 + }, + { + "epoch": 0.10443, + "grad_norm": 1.0508300123164835, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 10443 + }, + { + "epoch": 0.10444, + "grad_norm": 1.051811976800046, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 10444 + }, + { + "epoch": 0.10445, + "grad_norm": 0.7888503995506437, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 10445 + }, + { + "epoch": 0.10446, + "grad_norm": 0.6603911887857837, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 10446 + }, + { + "epoch": 0.10447, + "grad_norm": 0.8408603404059548, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 10447 + }, + { + "epoch": 0.10448, + "grad_norm": 0.9695074227496694, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 10448 + }, + { + "epoch": 0.10449, + "grad_norm": 0.9822010322827212, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 10449 + }, + { + "epoch": 0.1045, + "grad_norm": 1.0012725608752013, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 10450 + }, + { + "epoch": 0.10451, + "grad_norm": 1.009392360148877, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 10451 + }, + { + "epoch": 0.10452, + "grad_norm": 1.0090387388437994, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 10452 + }, + { + "epoch": 0.10453, + "grad_norm": 0.8943599349231653, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 10453 + }, + { + "epoch": 0.10454, + "grad_norm": 0.8560707064477248, + "learning_rate": 0.003, + "loss": 4.083, + "step": 10454 + }, + { + "epoch": 0.10455, + "grad_norm": 0.8752369980580909, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 10455 + }, + { + "epoch": 0.10456, + "grad_norm": 0.9130498227436277, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 10456 + }, + { + "epoch": 0.10457, + "grad_norm": 0.8067284336559211, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 10457 + }, + { + "epoch": 0.10458, + "grad_norm": 0.7395291417558032, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 10458 + }, + { + "epoch": 0.10459, + "grad_norm": 0.7297382370955492, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 10459 + }, + { + "epoch": 0.1046, + "grad_norm": 0.8594227462287262, + "learning_rate": 0.003, + "loss": 4.1102, + "step": 10460 + }, + { + "epoch": 0.10461, + "grad_norm": 0.8861318685991226, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 10461 + }, + { + "epoch": 0.10462, + "grad_norm": 0.8083739717834618, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 10462 + }, + { + "epoch": 0.10463, + "grad_norm": 0.855447088915927, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 10463 + }, + { + "epoch": 0.10464, + "grad_norm": 0.8309265331948995, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 10464 + }, + { + "epoch": 0.10465, + "grad_norm": 0.8950247767863003, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 10465 + }, + { + "epoch": 0.10466, + "grad_norm": 0.9702711674796212, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 10466 + }, + { + "epoch": 0.10467, + "grad_norm": 1.0808015279280165, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 10467 + }, + { + "epoch": 0.10468, + "grad_norm": 0.8796165704446247, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 10468 + }, + { + "epoch": 0.10469, + "grad_norm": 0.6864191337528118, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 10469 + }, + { + "epoch": 0.1047, + "grad_norm": 0.642911293497494, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 10470 + }, + { + "epoch": 0.10471, + "grad_norm": 0.6401286669093053, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 10471 + }, + { + "epoch": 0.10472, + "grad_norm": 0.7623940021861698, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 10472 + }, + { + "epoch": 0.10473, + "grad_norm": 0.7757677108816563, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 10473 + }, + { + "epoch": 0.10474, + "grad_norm": 0.8165254615464338, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 10474 + }, + { + "epoch": 0.10475, + "grad_norm": 0.6947649086407327, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 10475 + }, + { + "epoch": 0.10476, + "grad_norm": 0.609454645718345, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 10476 + }, + { + "epoch": 0.10477, + "grad_norm": 0.5156181671942854, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 10477 + }, + { + "epoch": 0.10478, + "grad_norm": 0.550404325359368, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 10478 + }, + { + "epoch": 0.10479, + "grad_norm": 0.5844190234215697, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 10479 + }, + { + "epoch": 0.1048, + "grad_norm": 0.6475683647990985, + "learning_rate": 0.003, + "loss": 4.062, + "step": 10480 + }, + { + "epoch": 0.10481, + "grad_norm": 0.765493534126949, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 10481 + }, + { + "epoch": 0.10482, + "grad_norm": 0.7460812863968737, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 10482 + }, + { + "epoch": 0.10483, + "grad_norm": 0.760367664758415, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 10483 + }, + { + "epoch": 0.10484, + "grad_norm": 0.7889324440591884, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 10484 + }, + { + "epoch": 0.10485, + "grad_norm": 0.7000125978773567, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 10485 + }, + { + "epoch": 0.10486, + "grad_norm": 0.697394489855323, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 10486 + }, + { + "epoch": 0.10487, + "grad_norm": 0.7734234905452742, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 10487 + }, + { + "epoch": 0.10488, + "grad_norm": 0.9578169878035425, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 10488 + }, + { + "epoch": 0.10489, + "grad_norm": 1.1043883747627603, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 10489 + }, + { + "epoch": 0.1049, + "grad_norm": 0.8806546383022705, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 10490 + }, + { + "epoch": 0.10491, + "grad_norm": 0.8581015926193526, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 10491 + }, + { + "epoch": 0.10492, + "grad_norm": 0.8562928611424699, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 10492 + }, + { + "epoch": 0.10493, + "grad_norm": 0.8058684115768378, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 10493 + }, + { + "epoch": 0.10494, + "grad_norm": 0.7770554096388587, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 10494 + }, + { + "epoch": 0.10495, + "grad_norm": 0.7334547382322241, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 10495 + }, + { + "epoch": 0.10496, + "grad_norm": 0.718416025393358, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 10496 + }, + { + "epoch": 0.10497, + "grad_norm": 0.7792435850503384, + "learning_rate": 0.003, + "loss": 4.067, + "step": 10497 + }, + { + "epoch": 0.10498, + "grad_norm": 0.8412922868706377, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 10498 + }, + { + "epoch": 0.10499, + "grad_norm": 0.9509337789006239, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 10499 + }, + { + "epoch": 0.105, + "grad_norm": 0.9912844508190685, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 10500 + }, + { + "epoch": 0.10501, + "grad_norm": 0.8580834861139071, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 10501 + }, + { + "epoch": 0.10502, + "grad_norm": 0.8167382602075896, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 10502 + }, + { + "epoch": 0.10503, + "grad_norm": 0.8743323585619591, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 10503 + }, + { + "epoch": 0.10504, + "grad_norm": 0.8327457707468423, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 10504 + }, + { + "epoch": 0.10505, + "grad_norm": 0.801391957128637, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 10505 + }, + { + "epoch": 0.10506, + "grad_norm": 0.6394241038195827, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 10506 + }, + { + "epoch": 0.10507, + "grad_norm": 0.6433059411947507, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 10507 + }, + { + "epoch": 0.10508, + "grad_norm": 0.6555621284685157, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 10508 + }, + { + "epoch": 0.10509, + "grad_norm": 0.6181176365447185, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 10509 + }, + { + "epoch": 0.1051, + "grad_norm": 0.65166605534626, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 10510 + }, + { + "epoch": 0.10511, + "grad_norm": 0.7356173809364867, + "learning_rate": 0.003, + "loss": 4.052, + "step": 10511 + }, + { + "epoch": 0.10512, + "grad_norm": 1.013079842957836, + "learning_rate": 0.003, + "loss": 4.1161, + "step": 10512 + }, + { + "epoch": 0.10513, + "grad_norm": 1.1755143739073186, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 10513 + }, + { + "epoch": 0.10514, + "grad_norm": 0.8163557674329753, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 10514 + }, + { + "epoch": 0.10515, + "grad_norm": 0.7085350074934578, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 10515 + }, + { + "epoch": 0.10516, + "grad_norm": 0.6658791858624886, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 10516 + }, + { + "epoch": 0.10517, + "grad_norm": 0.6041515015315367, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 10517 + }, + { + "epoch": 0.10518, + "grad_norm": 0.7047204808794802, + "learning_rate": 0.003, + "loss": 4.102, + "step": 10518 + }, + { + "epoch": 0.10519, + "grad_norm": 0.7244412290591962, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 10519 + }, + { + "epoch": 0.1052, + "grad_norm": 0.7395861089702314, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 10520 + }, + { + "epoch": 0.10521, + "grad_norm": 0.7665016692200823, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 10521 + }, + { + "epoch": 0.10522, + "grad_norm": 0.7792629094064418, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 10522 + }, + { + "epoch": 0.10523, + "grad_norm": 0.7953210132825478, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 10523 + }, + { + "epoch": 0.10524, + "grad_norm": 0.7991555641259503, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 10524 + }, + { + "epoch": 0.10525, + "grad_norm": 0.8238402147042898, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 10525 + }, + { + "epoch": 0.10526, + "grad_norm": 0.7163491665622135, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 10526 + }, + { + "epoch": 0.10527, + "grad_norm": 0.6545420151814723, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 10527 + }, + { + "epoch": 0.10528, + "grad_norm": 0.7166141333279368, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 10528 + }, + { + "epoch": 0.10529, + "grad_norm": 0.9100729617540565, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 10529 + }, + { + "epoch": 0.1053, + "grad_norm": 1.0028018301025132, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 10530 + }, + { + "epoch": 0.10531, + "grad_norm": 1.0807675817900035, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 10531 + }, + { + "epoch": 0.10532, + "grad_norm": 0.8870026361329219, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 10532 + }, + { + "epoch": 0.10533, + "grad_norm": 0.893246559986549, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 10533 + }, + { + "epoch": 0.10534, + "grad_norm": 0.8877419641098879, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 10534 + }, + { + "epoch": 0.10535, + "grad_norm": 0.804120621184703, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 10535 + }, + { + "epoch": 0.10536, + "grad_norm": 0.731737971690129, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 10536 + }, + { + "epoch": 0.10537, + "grad_norm": 0.6685167604634952, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 10537 + }, + { + "epoch": 0.10538, + "grad_norm": 0.6041586651576828, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 10538 + }, + { + "epoch": 0.10539, + "grad_norm": 0.5722575391865711, + "learning_rate": 0.003, + "loss": 4.072, + "step": 10539 + }, + { + "epoch": 0.1054, + "grad_norm": 0.6001113282044517, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 10540 + }, + { + "epoch": 0.10541, + "grad_norm": 0.5762315876118429, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 10541 + }, + { + "epoch": 0.10542, + "grad_norm": 0.563179289202697, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 10542 + }, + { + "epoch": 0.10543, + "grad_norm": 0.5947299652169051, + "learning_rate": 0.003, + "loss": 4.072, + "step": 10543 + }, + { + "epoch": 0.10544, + "grad_norm": 0.6985463762361143, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 10544 + }, + { + "epoch": 0.10545, + "grad_norm": 0.8488458192713703, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 10545 + }, + { + "epoch": 0.10546, + "grad_norm": 1.134283035666868, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 10546 + }, + { + "epoch": 0.10547, + "grad_norm": 1.008621876461993, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 10547 + }, + { + "epoch": 0.10548, + "grad_norm": 0.8910932087633671, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 10548 + }, + { + "epoch": 0.10549, + "grad_norm": 0.809160397609863, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 10549 + }, + { + "epoch": 0.1055, + "grad_norm": 0.8218623812846083, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 10550 + }, + { + "epoch": 0.10551, + "grad_norm": 0.7992528000231008, + "learning_rate": 0.003, + "loss": 4.107, + "step": 10551 + }, + { + "epoch": 0.10552, + "grad_norm": 0.86719344382475, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 10552 + }, + { + "epoch": 0.10553, + "grad_norm": 0.908060487944853, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 10553 + }, + { + "epoch": 0.10554, + "grad_norm": 0.8652695049802602, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 10554 + }, + { + "epoch": 0.10555, + "grad_norm": 0.6860164989967895, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 10555 + }, + { + "epoch": 0.10556, + "grad_norm": 0.6378293392988055, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 10556 + }, + { + "epoch": 0.10557, + "grad_norm": 0.62800965514661, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 10557 + }, + { + "epoch": 0.10558, + "grad_norm": 0.6963803928423222, + "learning_rate": 0.003, + "loss": 4.062, + "step": 10558 + }, + { + "epoch": 0.10559, + "grad_norm": 0.6865270024171043, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 10559 + }, + { + "epoch": 0.1056, + "grad_norm": 0.737499509213761, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 10560 + }, + { + "epoch": 0.10561, + "grad_norm": 0.8984522058594936, + "learning_rate": 0.003, + "loss": 4.077, + "step": 10561 + }, + { + "epoch": 0.10562, + "grad_norm": 1.1339394563977538, + "learning_rate": 0.003, + "loss": 4.066, + "step": 10562 + }, + { + "epoch": 0.10563, + "grad_norm": 0.8198771181383394, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 10563 + }, + { + "epoch": 0.10564, + "grad_norm": 0.7658877274749886, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 10564 + }, + { + "epoch": 0.10565, + "grad_norm": 0.8482606637522958, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 10565 + }, + { + "epoch": 0.10566, + "grad_norm": 0.8347786010621645, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 10566 + }, + { + "epoch": 0.10567, + "grad_norm": 0.8843860041980399, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 10567 + }, + { + "epoch": 0.10568, + "grad_norm": 0.990139802584295, + "learning_rate": 0.003, + "loss": 4.1173, + "step": 10568 + }, + { + "epoch": 0.10569, + "grad_norm": 0.9852662752867085, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 10569 + }, + { + "epoch": 0.1057, + "grad_norm": 0.9372987172274819, + "learning_rate": 0.003, + "loss": 4.084, + "step": 10570 + }, + { + "epoch": 0.10571, + "grad_norm": 0.988820397043408, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 10571 + }, + { + "epoch": 0.10572, + "grad_norm": 1.1678833347420248, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 10572 + }, + { + "epoch": 0.10573, + "grad_norm": 0.8517913634074189, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 10573 + }, + { + "epoch": 0.10574, + "grad_norm": 0.8659007094767834, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 10574 + }, + { + "epoch": 0.10575, + "grad_norm": 0.958183815886997, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 10575 + }, + { + "epoch": 0.10576, + "grad_norm": 0.9118557877430638, + "learning_rate": 0.003, + "loss": 4.1164, + "step": 10576 + }, + { + "epoch": 0.10577, + "grad_norm": 0.9153226156451957, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 10577 + }, + { + "epoch": 0.10578, + "grad_norm": 0.9874735654072453, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 10578 + }, + { + "epoch": 0.10579, + "grad_norm": 1.0415079742593492, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 10579 + }, + { + "epoch": 0.1058, + "grad_norm": 0.9153784660111088, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 10580 + }, + { + "epoch": 0.10581, + "grad_norm": 0.9210262014080384, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 10581 + }, + { + "epoch": 0.10582, + "grad_norm": 0.9396705464198691, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 10582 + }, + { + "epoch": 0.10583, + "grad_norm": 0.9120879354056952, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 10583 + }, + { + "epoch": 0.10584, + "grad_norm": 0.8070610834160225, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 10584 + }, + { + "epoch": 0.10585, + "grad_norm": 0.8983503783260497, + "learning_rate": 0.003, + "loss": 4.1378, + "step": 10585 + }, + { + "epoch": 0.10586, + "grad_norm": 0.942723580765965, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 10586 + }, + { + "epoch": 0.10587, + "grad_norm": 0.9654495812888432, + "learning_rate": 0.003, + "loss": 4.1261, + "step": 10587 + }, + { + "epoch": 0.10588, + "grad_norm": 1.0137540717282463, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 10588 + }, + { + "epoch": 0.10589, + "grad_norm": 0.9211083933242845, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 10589 + }, + { + "epoch": 0.1059, + "grad_norm": 0.8886434692979224, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 10590 + }, + { + "epoch": 0.10591, + "grad_norm": 0.8414672791491735, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 10591 + }, + { + "epoch": 0.10592, + "grad_norm": 0.808437412569294, + "learning_rate": 0.003, + "loss": 4.1363, + "step": 10592 + }, + { + "epoch": 0.10593, + "grad_norm": 0.6746674356577256, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 10593 + }, + { + "epoch": 0.10594, + "grad_norm": 0.6222040276891654, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 10594 + }, + { + "epoch": 0.10595, + "grad_norm": 0.6313983056054177, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 10595 + }, + { + "epoch": 0.10596, + "grad_norm": 0.6792929100891301, + "learning_rate": 0.003, + "loss": 4.097, + "step": 10596 + }, + { + "epoch": 0.10597, + "grad_norm": 0.6800719059268772, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 10597 + }, + { + "epoch": 0.10598, + "grad_norm": 0.770312479924072, + "learning_rate": 0.003, + "loss": 4.097, + "step": 10598 + }, + { + "epoch": 0.10599, + "grad_norm": 0.9764320147050246, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 10599 + }, + { + "epoch": 0.106, + "grad_norm": 0.9114216015686166, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 10600 + }, + { + "epoch": 0.10601, + "grad_norm": 0.6555189332915664, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 10601 + }, + { + "epoch": 0.10602, + "grad_norm": 0.5040307133652577, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 10602 + }, + { + "epoch": 0.10603, + "grad_norm": 0.5690484358230466, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 10603 + }, + { + "epoch": 0.10604, + "grad_norm": 0.5637735595856799, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 10604 + }, + { + "epoch": 0.10605, + "grad_norm": 0.48001727986025794, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 10605 + }, + { + "epoch": 0.10606, + "grad_norm": 0.4919296874299588, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 10606 + }, + { + "epoch": 0.10607, + "grad_norm": 0.4727741741475219, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 10607 + }, + { + "epoch": 0.10608, + "grad_norm": 0.5184442406668994, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 10608 + }, + { + "epoch": 0.10609, + "grad_norm": 0.5858630638238149, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 10609 + }, + { + "epoch": 0.1061, + "grad_norm": 0.6874353934764682, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 10610 + }, + { + "epoch": 0.10611, + "grad_norm": 0.8263741726587102, + "learning_rate": 0.003, + "loss": 4.041, + "step": 10611 + }, + { + "epoch": 0.10612, + "grad_norm": 0.8884315684622063, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 10612 + }, + { + "epoch": 0.10613, + "grad_norm": 0.8763453508937509, + "learning_rate": 0.003, + "loss": 4.069, + "step": 10613 + }, + { + "epoch": 0.10614, + "grad_norm": 0.8224298838859537, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 10614 + }, + { + "epoch": 0.10615, + "grad_norm": 0.7191046159980217, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 10615 + }, + { + "epoch": 0.10616, + "grad_norm": 0.7315824929912534, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 10616 + }, + { + "epoch": 0.10617, + "grad_norm": 0.916956377366291, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 10617 + }, + { + "epoch": 0.10618, + "grad_norm": 1.2629005890508869, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 10618 + }, + { + "epoch": 0.10619, + "grad_norm": 0.856578006061621, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 10619 + }, + { + "epoch": 0.1062, + "grad_norm": 0.7408131912141622, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 10620 + }, + { + "epoch": 0.10621, + "grad_norm": 0.6470727798627832, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 10621 + }, + { + "epoch": 0.10622, + "grad_norm": 0.5725004840385046, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 10622 + }, + { + "epoch": 0.10623, + "grad_norm": 0.553432799759805, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 10623 + }, + { + "epoch": 0.10624, + "grad_norm": 0.5403048416895829, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 10624 + }, + { + "epoch": 0.10625, + "grad_norm": 0.5104072292295226, + "learning_rate": 0.003, + "loss": 4.06, + "step": 10625 + }, + { + "epoch": 0.10626, + "grad_norm": 0.5823780352699905, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 10626 + }, + { + "epoch": 0.10627, + "grad_norm": 0.6962228994056245, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 10627 + }, + { + "epoch": 0.10628, + "grad_norm": 0.7935604816074494, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 10628 + }, + { + "epoch": 0.10629, + "grad_norm": 0.9695291887403658, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 10629 + }, + { + "epoch": 0.1063, + "grad_norm": 1.2894223340608182, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 10630 + }, + { + "epoch": 0.10631, + "grad_norm": 0.6621419698664075, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 10631 + }, + { + "epoch": 0.10632, + "grad_norm": 0.8074981429308187, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 10632 + }, + { + "epoch": 0.10633, + "grad_norm": 0.9026339646908454, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 10633 + }, + { + "epoch": 0.10634, + "grad_norm": 0.8417206941624089, + "learning_rate": 0.003, + "loss": 4.078, + "step": 10634 + }, + { + "epoch": 0.10635, + "grad_norm": 0.8020780791146946, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 10635 + }, + { + "epoch": 0.10636, + "grad_norm": 0.8075945654041848, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 10636 + }, + { + "epoch": 0.10637, + "grad_norm": 0.7764114257102779, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 10637 + }, + { + "epoch": 0.10638, + "grad_norm": 0.8095214640783682, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 10638 + }, + { + "epoch": 0.10639, + "grad_norm": 0.8784677433844998, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 10639 + }, + { + "epoch": 0.1064, + "grad_norm": 0.8355273252342663, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 10640 + }, + { + "epoch": 0.10641, + "grad_norm": 0.9458422852748879, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 10641 + }, + { + "epoch": 0.10642, + "grad_norm": 1.068264830551457, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 10642 + }, + { + "epoch": 0.10643, + "grad_norm": 1.0523298329122281, + "learning_rate": 0.003, + "loss": 4.072, + "step": 10643 + }, + { + "epoch": 0.10644, + "grad_norm": 0.7659982628957405, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 10644 + }, + { + "epoch": 0.10645, + "grad_norm": 0.652728924657321, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 10645 + }, + { + "epoch": 0.10646, + "grad_norm": 0.6456066494246264, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 10646 + }, + { + "epoch": 0.10647, + "grad_norm": 0.7464907408983558, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 10647 + }, + { + "epoch": 0.10648, + "grad_norm": 0.7405212525064351, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 10648 + }, + { + "epoch": 0.10649, + "grad_norm": 0.7478639631456179, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 10649 + }, + { + "epoch": 0.1065, + "grad_norm": 0.7550955604366715, + "learning_rate": 0.003, + "loss": 4.1162, + "step": 10650 + }, + { + "epoch": 0.10651, + "grad_norm": 0.7110846298696153, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 10651 + }, + { + "epoch": 0.10652, + "grad_norm": 0.6402684922431795, + "learning_rate": 0.003, + "loss": 4.084, + "step": 10652 + }, + { + "epoch": 0.10653, + "grad_norm": 0.59791029327084, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 10653 + }, + { + "epoch": 0.10654, + "grad_norm": 0.6432662968161444, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 10654 + }, + { + "epoch": 0.10655, + "grad_norm": 0.6752567227234482, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 10655 + }, + { + "epoch": 0.10656, + "grad_norm": 0.6800244754630034, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 10656 + }, + { + "epoch": 0.10657, + "grad_norm": 0.5959513716804604, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 10657 + }, + { + "epoch": 0.10658, + "grad_norm": 0.6403859324045968, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 10658 + }, + { + "epoch": 0.10659, + "grad_norm": 0.8296924742157811, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 10659 + }, + { + "epoch": 0.1066, + "grad_norm": 1.0682115948141957, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 10660 + }, + { + "epoch": 0.10661, + "grad_norm": 1.1194993070021568, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 10661 + }, + { + "epoch": 0.10662, + "grad_norm": 0.9275062203048765, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 10662 + }, + { + "epoch": 0.10663, + "grad_norm": 0.8557195585149507, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 10663 + }, + { + "epoch": 0.10664, + "grad_norm": 0.7332891166349356, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 10664 + }, + { + "epoch": 0.10665, + "grad_norm": 0.7737620390856565, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 10665 + }, + { + "epoch": 0.10666, + "grad_norm": 0.8490683224003708, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 10666 + }, + { + "epoch": 0.10667, + "grad_norm": 0.8834112771739413, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 10667 + }, + { + "epoch": 0.10668, + "grad_norm": 0.9844633934429818, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 10668 + }, + { + "epoch": 0.10669, + "grad_norm": 0.932517734016524, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 10669 + }, + { + "epoch": 0.1067, + "grad_norm": 0.83578599313614, + "learning_rate": 0.003, + "loss": 4.073, + "step": 10670 + }, + { + "epoch": 0.10671, + "grad_norm": 0.9069085658024532, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 10671 + }, + { + "epoch": 0.10672, + "grad_norm": 0.9945862570463592, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 10672 + }, + { + "epoch": 0.10673, + "grad_norm": 1.09965107012104, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 10673 + }, + { + "epoch": 0.10674, + "grad_norm": 0.8268182899214279, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 10674 + }, + { + "epoch": 0.10675, + "grad_norm": 0.7034549781067129, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 10675 + }, + { + "epoch": 0.10676, + "grad_norm": 0.7890775923714155, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 10676 + }, + { + "epoch": 0.10677, + "grad_norm": 0.7900757896942041, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 10677 + }, + { + "epoch": 0.10678, + "grad_norm": 0.734976551432324, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 10678 + }, + { + "epoch": 0.10679, + "grad_norm": 0.7314744669978124, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 10679 + }, + { + "epoch": 0.1068, + "grad_norm": 0.6908817611613471, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 10680 + }, + { + "epoch": 0.10681, + "grad_norm": 0.6399548375986829, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 10681 + }, + { + "epoch": 0.10682, + "grad_norm": 0.7053930398751589, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 10682 + }, + { + "epoch": 0.10683, + "grad_norm": 0.8259488173340027, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 10683 + }, + { + "epoch": 0.10684, + "grad_norm": 0.8721003907230183, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 10684 + }, + { + "epoch": 0.10685, + "grad_norm": 0.8470217654691111, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 10685 + }, + { + "epoch": 0.10686, + "grad_norm": 0.90992443878563, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 10686 + }, + { + "epoch": 0.10687, + "grad_norm": 1.024100970580944, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 10687 + }, + { + "epoch": 0.10688, + "grad_norm": 1.0637806320585521, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 10688 + }, + { + "epoch": 0.10689, + "grad_norm": 1.1156946627292235, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 10689 + }, + { + "epoch": 0.1069, + "grad_norm": 0.8046943265183353, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 10690 + }, + { + "epoch": 0.10691, + "grad_norm": 0.7100711170524753, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 10691 + }, + { + "epoch": 0.10692, + "grad_norm": 0.6574317863899862, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 10692 + }, + { + "epoch": 0.10693, + "grad_norm": 0.6251555571305705, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 10693 + }, + { + "epoch": 0.10694, + "grad_norm": 0.679376390140605, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 10694 + }, + { + "epoch": 0.10695, + "grad_norm": 0.8280371560374931, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 10695 + }, + { + "epoch": 0.10696, + "grad_norm": 0.9932001330147407, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 10696 + }, + { + "epoch": 0.10697, + "grad_norm": 1.1062910064134184, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 10697 + }, + { + "epoch": 0.10698, + "grad_norm": 0.7722807520515808, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 10698 + }, + { + "epoch": 0.10699, + "grad_norm": 0.7081182958148078, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 10699 + }, + { + "epoch": 0.107, + "grad_norm": 0.7166968485485714, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 10700 + }, + { + "epoch": 0.10701, + "grad_norm": 0.7537508686418137, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 10701 + }, + { + "epoch": 0.10702, + "grad_norm": 0.8835708055800392, + "learning_rate": 0.003, + "loss": 4.074, + "step": 10702 + }, + { + "epoch": 0.10703, + "grad_norm": 0.7817691460438141, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 10703 + }, + { + "epoch": 0.10704, + "grad_norm": 0.645477228585237, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 10704 + }, + { + "epoch": 0.10705, + "grad_norm": 0.7710367164781207, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 10705 + }, + { + "epoch": 0.10706, + "grad_norm": 0.7901517958326595, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 10706 + }, + { + "epoch": 0.10707, + "grad_norm": 0.7891516356491817, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 10707 + }, + { + "epoch": 0.10708, + "grad_norm": 0.8615348206451179, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 10708 + }, + { + "epoch": 0.10709, + "grad_norm": 0.9888041530740351, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 10709 + }, + { + "epoch": 0.1071, + "grad_norm": 1.061660380792365, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 10710 + }, + { + "epoch": 0.10711, + "grad_norm": 0.9066601556149967, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 10711 + }, + { + "epoch": 0.10712, + "grad_norm": 0.8116455998652893, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 10712 + }, + { + "epoch": 0.10713, + "grad_norm": 0.8637113563116897, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 10713 + }, + { + "epoch": 0.10714, + "grad_norm": 0.7022220024601558, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 10714 + }, + { + "epoch": 0.10715, + "grad_norm": 0.6293443796442022, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 10715 + }, + { + "epoch": 0.10716, + "grad_norm": 0.6779094422266427, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 10716 + }, + { + "epoch": 0.10717, + "grad_norm": 0.7211066849511437, + "learning_rate": 0.003, + "loss": 4.089, + "step": 10717 + }, + { + "epoch": 0.10718, + "grad_norm": 0.6577616772264899, + "learning_rate": 0.003, + "loss": 4.048, + "step": 10718 + }, + { + "epoch": 0.10719, + "grad_norm": 0.6728898224028206, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 10719 + }, + { + "epoch": 0.1072, + "grad_norm": 0.6504562644268949, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 10720 + }, + { + "epoch": 0.10721, + "grad_norm": 0.5906580401216541, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 10721 + }, + { + "epoch": 0.10722, + "grad_norm": 0.6651104603087701, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 10722 + }, + { + "epoch": 0.10723, + "grad_norm": 0.8274844100078965, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 10723 + }, + { + "epoch": 0.10724, + "grad_norm": 1.107508585414254, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 10724 + }, + { + "epoch": 0.10725, + "grad_norm": 0.9770130165439336, + "learning_rate": 0.003, + "loss": 4.055, + "step": 10725 + }, + { + "epoch": 0.10726, + "grad_norm": 0.8609099367579911, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 10726 + }, + { + "epoch": 0.10727, + "grad_norm": 0.729890468071586, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 10727 + }, + { + "epoch": 0.10728, + "grad_norm": 0.6948485160734639, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 10728 + }, + { + "epoch": 0.10729, + "grad_norm": 0.6678861533303945, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 10729 + }, + { + "epoch": 0.1073, + "grad_norm": 0.7248961043248474, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 10730 + }, + { + "epoch": 0.10731, + "grad_norm": 0.7071667241504125, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 10731 + }, + { + "epoch": 0.10732, + "grad_norm": 0.7205983714963741, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 10732 + }, + { + "epoch": 0.10733, + "grad_norm": 0.688369875669321, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 10733 + }, + { + "epoch": 0.10734, + "grad_norm": 0.6741612928630552, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 10734 + }, + { + "epoch": 0.10735, + "grad_norm": 0.7199477700388638, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 10735 + }, + { + "epoch": 0.10736, + "grad_norm": 0.7876731105567968, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 10736 + }, + { + "epoch": 0.10737, + "grad_norm": 0.8324327281824561, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 10737 + }, + { + "epoch": 0.10738, + "grad_norm": 0.8709320081981367, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 10738 + }, + { + "epoch": 0.10739, + "grad_norm": 0.9562955220392207, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 10739 + }, + { + "epoch": 0.1074, + "grad_norm": 1.2304307954831826, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 10740 + }, + { + "epoch": 0.10741, + "grad_norm": 0.9300435353142966, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 10741 + }, + { + "epoch": 0.10742, + "grad_norm": 0.9335382984724416, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 10742 + }, + { + "epoch": 0.10743, + "grad_norm": 0.9877653745311188, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 10743 + }, + { + "epoch": 0.10744, + "grad_norm": 1.0405104571146349, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 10744 + }, + { + "epoch": 0.10745, + "grad_norm": 0.8419311900278269, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 10745 + }, + { + "epoch": 0.10746, + "grad_norm": 0.8478057480297286, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 10746 + }, + { + "epoch": 0.10747, + "grad_norm": 0.9138973791244602, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 10747 + }, + { + "epoch": 0.10748, + "grad_norm": 0.9349681802837756, + "learning_rate": 0.003, + "loss": 4.1211, + "step": 10748 + }, + { + "epoch": 0.10749, + "grad_norm": 0.950015954392382, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 10749 + }, + { + "epoch": 0.1075, + "grad_norm": 0.8943074737592488, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 10750 + }, + { + "epoch": 0.10751, + "grad_norm": 0.8401604688025964, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 10751 + }, + { + "epoch": 0.10752, + "grad_norm": 0.8907490401952103, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 10752 + }, + { + "epoch": 0.10753, + "grad_norm": 0.8019963256041405, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 10753 + }, + { + "epoch": 0.10754, + "grad_norm": 0.7334278789249893, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 10754 + }, + { + "epoch": 0.10755, + "grad_norm": 0.7807073635179747, + "learning_rate": 0.003, + "loss": 4.1094, + "step": 10755 + }, + { + "epoch": 0.10756, + "grad_norm": 0.9037897305755073, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 10756 + }, + { + "epoch": 0.10757, + "grad_norm": 1.0038789915507889, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 10757 + }, + { + "epoch": 0.10758, + "grad_norm": 0.8637668717135266, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 10758 + }, + { + "epoch": 0.10759, + "grad_norm": 0.7784709868978823, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 10759 + }, + { + "epoch": 0.1076, + "grad_norm": 0.7513385059899371, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 10760 + }, + { + "epoch": 0.10761, + "grad_norm": 0.658145040039802, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 10761 + }, + { + "epoch": 0.10762, + "grad_norm": 0.6319284675105509, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 10762 + }, + { + "epoch": 0.10763, + "grad_norm": 0.6484314688691583, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 10763 + }, + { + "epoch": 0.10764, + "grad_norm": 0.6447554772230162, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 10764 + }, + { + "epoch": 0.10765, + "grad_norm": 0.6947856450689127, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 10765 + }, + { + "epoch": 0.10766, + "grad_norm": 0.8271087875534372, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 10766 + }, + { + "epoch": 0.10767, + "grad_norm": 0.9248495761280467, + "learning_rate": 0.003, + "loss": 4.083, + "step": 10767 + }, + { + "epoch": 0.10768, + "grad_norm": 0.906365044623054, + "learning_rate": 0.003, + "loss": 4.079, + "step": 10768 + }, + { + "epoch": 0.10769, + "grad_norm": 0.9421556590622695, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 10769 + }, + { + "epoch": 0.1077, + "grad_norm": 0.9830274427766104, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 10770 + }, + { + "epoch": 0.10771, + "grad_norm": 0.7459901139183797, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 10771 + }, + { + "epoch": 0.10772, + "grad_norm": 0.8029037552458691, + "learning_rate": 0.003, + "loss": 4.08, + "step": 10772 + }, + { + "epoch": 0.10773, + "grad_norm": 0.8413257874108826, + "learning_rate": 0.003, + "loss": 4.094, + "step": 10773 + }, + { + "epoch": 0.10774, + "grad_norm": 0.8939967598650286, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 10774 + }, + { + "epoch": 0.10775, + "grad_norm": 0.9500930304152452, + "learning_rate": 0.003, + "loss": 4.1063, + "step": 10775 + }, + { + "epoch": 0.10776, + "grad_norm": 0.9180361337301888, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 10776 + }, + { + "epoch": 0.10777, + "grad_norm": 0.8739106054853617, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 10777 + }, + { + "epoch": 0.10778, + "grad_norm": 0.7861751893427512, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 10778 + }, + { + "epoch": 0.10779, + "grad_norm": 0.6801288616685922, + "learning_rate": 0.003, + "loss": 4.058, + "step": 10779 + }, + { + "epoch": 0.1078, + "grad_norm": 0.6595454326458605, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 10780 + }, + { + "epoch": 0.10781, + "grad_norm": 0.677694591198545, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 10781 + }, + { + "epoch": 0.10782, + "grad_norm": 0.7322536223576103, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 10782 + }, + { + "epoch": 0.10783, + "grad_norm": 0.6949774146294931, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 10783 + }, + { + "epoch": 0.10784, + "grad_norm": 0.7112783123302691, + "learning_rate": 0.003, + "loss": 4.1034, + "step": 10784 + }, + { + "epoch": 0.10785, + "grad_norm": 0.7624840584818994, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 10785 + }, + { + "epoch": 0.10786, + "grad_norm": 0.8307593955140588, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 10786 + }, + { + "epoch": 0.10787, + "grad_norm": 0.8234800415780226, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 10787 + }, + { + "epoch": 0.10788, + "grad_norm": 0.7063471007386206, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 10788 + }, + { + "epoch": 0.10789, + "grad_norm": 0.7003812666554212, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 10789 + }, + { + "epoch": 0.1079, + "grad_norm": 0.591255796389519, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 10790 + }, + { + "epoch": 0.10791, + "grad_norm": 0.6588407077575648, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 10791 + }, + { + "epoch": 0.10792, + "grad_norm": 0.7812948240216615, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 10792 + }, + { + "epoch": 0.10793, + "grad_norm": 0.875484648604842, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 10793 + }, + { + "epoch": 0.10794, + "grad_norm": 1.0819813451984712, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 10794 + }, + { + "epoch": 0.10795, + "grad_norm": 1.0106647386412784, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 10795 + }, + { + "epoch": 0.10796, + "grad_norm": 0.8727394703935979, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 10796 + }, + { + "epoch": 0.10797, + "grad_norm": 0.7493976900488687, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 10797 + }, + { + "epoch": 0.10798, + "grad_norm": 0.6778670897020321, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 10798 + }, + { + "epoch": 0.10799, + "grad_norm": 0.6886630260315241, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 10799 + }, + { + "epoch": 0.108, + "grad_norm": 0.6997924479763118, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 10800 + }, + { + "epoch": 0.10801, + "grad_norm": 0.6335312641192151, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 10801 + }, + { + "epoch": 0.10802, + "grad_norm": 0.5596975143808999, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 10802 + }, + { + "epoch": 0.10803, + "grad_norm": 0.5883654642439371, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 10803 + }, + { + "epoch": 0.10804, + "grad_norm": 0.6353797720617529, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 10804 + }, + { + "epoch": 0.10805, + "grad_norm": 0.766636901293996, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 10805 + }, + { + "epoch": 0.10806, + "grad_norm": 0.9243626908434707, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 10806 + }, + { + "epoch": 0.10807, + "grad_norm": 0.9974198487878154, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 10807 + }, + { + "epoch": 0.10808, + "grad_norm": 0.906105158565713, + "learning_rate": 0.003, + "loss": 4.1665, + "step": 10808 + }, + { + "epoch": 0.10809, + "grad_norm": 0.9969514627729021, + "learning_rate": 0.003, + "loss": 4.097, + "step": 10809 + }, + { + "epoch": 0.1081, + "grad_norm": 0.834425050345777, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 10810 + }, + { + "epoch": 0.10811, + "grad_norm": 0.8783993703886297, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 10811 + }, + { + "epoch": 0.10812, + "grad_norm": 1.0486220426031836, + "learning_rate": 0.003, + "loss": 4.069, + "step": 10812 + }, + { + "epoch": 0.10813, + "grad_norm": 0.9635854562205466, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 10813 + }, + { + "epoch": 0.10814, + "grad_norm": 0.991913843257712, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 10814 + }, + { + "epoch": 0.10815, + "grad_norm": 1.03240484345877, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 10815 + }, + { + "epoch": 0.10816, + "grad_norm": 0.7999243463833863, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 10816 + }, + { + "epoch": 0.10817, + "grad_norm": 0.7146118966430277, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 10817 + }, + { + "epoch": 0.10818, + "grad_norm": 0.6760564583671121, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 10818 + }, + { + "epoch": 0.10819, + "grad_norm": 0.7437787761517194, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 10819 + }, + { + "epoch": 0.1082, + "grad_norm": 0.7667318805568926, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 10820 + }, + { + "epoch": 0.10821, + "grad_norm": 0.7578063636892496, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 10821 + }, + { + "epoch": 0.10822, + "grad_norm": 0.9722299248990864, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 10822 + }, + { + "epoch": 0.10823, + "grad_norm": 1.084422938439403, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 10823 + }, + { + "epoch": 0.10824, + "grad_norm": 0.7866493459633321, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 10824 + }, + { + "epoch": 0.10825, + "grad_norm": 0.6928064188763265, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 10825 + }, + { + "epoch": 0.10826, + "grad_norm": 0.7316035379733721, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 10826 + }, + { + "epoch": 0.10827, + "grad_norm": 0.7872692879605871, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 10827 + }, + { + "epoch": 0.10828, + "grad_norm": 0.6915375778555326, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 10828 + }, + { + "epoch": 0.10829, + "grad_norm": 0.748471989224979, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 10829 + }, + { + "epoch": 0.1083, + "grad_norm": 0.763563153780263, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 10830 + }, + { + "epoch": 0.10831, + "grad_norm": 0.7557051035736269, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 10831 + }, + { + "epoch": 0.10832, + "grad_norm": 0.8081559760247755, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 10832 + }, + { + "epoch": 0.10833, + "grad_norm": 0.7916475572460021, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 10833 + }, + { + "epoch": 0.10834, + "grad_norm": 0.7180126556558653, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 10834 + }, + { + "epoch": 0.10835, + "grad_norm": 0.6897522880798621, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 10835 + }, + { + "epoch": 0.10836, + "grad_norm": 0.720390447907444, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 10836 + }, + { + "epoch": 0.10837, + "grad_norm": 0.7729119904062143, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 10837 + }, + { + "epoch": 0.10838, + "grad_norm": 0.974189110491675, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 10838 + }, + { + "epoch": 0.10839, + "grad_norm": 1.1146789421849448, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 10839 + }, + { + "epoch": 0.1084, + "grad_norm": 0.7863144186085149, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 10840 + }, + { + "epoch": 0.10841, + "grad_norm": 0.6344950133109905, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 10841 + }, + { + "epoch": 0.10842, + "grad_norm": 0.6253130642537409, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 10842 + }, + { + "epoch": 0.10843, + "grad_norm": 0.642829186840075, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 10843 + }, + { + "epoch": 0.10844, + "grad_norm": 0.7172865995614057, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 10844 + }, + { + "epoch": 0.10845, + "grad_norm": 0.8387230458337437, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 10845 + }, + { + "epoch": 0.10846, + "grad_norm": 0.8272335404876863, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 10846 + }, + { + "epoch": 0.10847, + "grad_norm": 0.8339723408210796, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 10847 + }, + { + "epoch": 0.10848, + "grad_norm": 0.6960991280713948, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 10848 + }, + { + "epoch": 0.10849, + "grad_norm": 0.621407103825329, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 10849 + }, + { + "epoch": 0.1085, + "grad_norm": 0.6226763190146141, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 10850 + }, + { + "epoch": 0.10851, + "grad_norm": 0.7346253480020578, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 10851 + }, + { + "epoch": 0.10852, + "grad_norm": 0.8514943147557615, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 10852 + }, + { + "epoch": 0.10853, + "grad_norm": 0.9517437965287522, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 10853 + }, + { + "epoch": 0.10854, + "grad_norm": 0.9508741169410276, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 10854 + }, + { + "epoch": 0.10855, + "grad_norm": 0.7768027675885154, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 10855 + }, + { + "epoch": 0.10856, + "grad_norm": 0.7413209828275256, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 10856 + }, + { + "epoch": 0.10857, + "grad_norm": 0.7185402434742164, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 10857 + }, + { + "epoch": 0.10858, + "grad_norm": 0.6826430053195711, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 10858 + }, + { + "epoch": 0.10859, + "grad_norm": 0.761533780013182, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 10859 + }, + { + "epoch": 0.1086, + "grad_norm": 0.7642804260240755, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 10860 + }, + { + "epoch": 0.10861, + "grad_norm": 0.825548626289074, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 10861 + }, + { + "epoch": 0.10862, + "grad_norm": 0.9037697230091956, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 10862 + }, + { + "epoch": 0.10863, + "grad_norm": 0.8657754903618303, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 10863 + }, + { + "epoch": 0.10864, + "grad_norm": 0.8141322733115343, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 10864 + }, + { + "epoch": 0.10865, + "grad_norm": 0.7253469991497675, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 10865 + }, + { + "epoch": 0.10866, + "grad_norm": 0.7117167359362205, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 10866 + }, + { + "epoch": 0.10867, + "grad_norm": 0.7882818390575478, + "learning_rate": 0.003, + "loss": 4.057, + "step": 10867 + }, + { + "epoch": 0.10868, + "grad_norm": 0.9220190688619744, + "learning_rate": 0.003, + "loss": 4.058, + "step": 10868 + }, + { + "epoch": 0.10869, + "grad_norm": 1.2171062146891194, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 10869 + }, + { + "epoch": 0.1087, + "grad_norm": 0.9409103742766868, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 10870 + }, + { + "epoch": 0.10871, + "grad_norm": 0.7857575782459725, + "learning_rate": 0.003, + "loss": 4.067, + "step": 10871 + }, + { + "epoch": 0.10872, + "grad_norm": 0.7008071897173932, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 10872 + }, + { + "epoch": 0.10873, + "grad_norm": 0.6493494671071487, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 10873 + }, + { + "epoch": 0.10874, + "grad_norm": 0.7422045080755252, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 10874 + }, + { + "epoch": 0.10875, + "grad_norm": 0.7813850020221002, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 10875 + }, + { + "epoch": 0.10876, + "grad_norm": 0.7337134215205611, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 10876 + }, + { + "epoch": 0.10877, + "grad_norm": 0.5738251794128827, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 10877 + }, + { + "epoch": 0.10878, + "grad_norm": 0.5971880485148056, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 10878 + }, + { + "epoch": 0.10879, + "grad_norm": 0.6407269342613244, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 10879 + }, + { + "epoch": 0.1088, + "grad_norm": 0.7567936302504148, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 10880 + }, + { + "epoch": 0.10881, + "grad_norm": 0.8952438404265414, + "learning_rate": 0.003, + "loss": 4.053, + "step": 10881 + }, + { + "epoch": 0.10882, + "grad_norm": 1.016869262938487, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 10882 + }, + { + "epoch": 0.10883, + "grad_norm": 0.9089472739873565, + "learning_rate": 0.003, + "loss": 4.097, + "step": 10883 + }, + { + "epoch": 0.10884, + "grad_norm": 0.8192132716200892, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 10884 + }, + { + "epoch": 0.10885, + "grad_norm": 0.754863660292805, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 10885 + }, + { + "epoch": 0.10886, + "grad_norm": 0.7646838940972092, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 10886 + }, + { + "epoch": 0.10887, + "grad_norm": 0.8054792718032056, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 10887 + }, + { + "epoch": 0.10888, + "grad_norm": 0.807217157369814, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 10888 + }, + { + "epoch": 0.10889, + "grad_norm": 0.9596711545065643, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 10889 + }, + { + "epoch": 0.1089, + "grad_norm": 1.0281488211494914, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 10890 + }, + { + "epoch": 0.10891, + "grad_norm": 1.008425907314204, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 10891 + }, + { + "epoch": 0.10892, + "grad_norm": 0.9881878897319983, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 10892 + }, + { + "epoch": 0.10893, + "grad_norm": 0.9651702578236134, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 10893 + }, + { + "epoch": 0.10894, + "grad_norm": 0.91488242972445, + "learning_rate": 0.003, + "loss": 4.082, + "step": 10894 + }, + { + "epoch": 0.10895, + "grad_norm": 0.820190985605287, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 10895 + }, + { + "epoch": 0.10896, + "grad_norm": 0.8047362457919742, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 10896 + }, + { + "epoch": 0.10897, + "grad_norm": 0.8272579145202782, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 10897 + }, + { + "epoch": 0.10898, + "grad_norm": 0.8529516426412708, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 10898 + }, + { + "epoch": 0.10899, + "grad_norm": 0.792496666395517, + "learning_rate": 0.003, + "loss": 4.079, + "step": 10899 + }, + { + "epoch": 0.109, + "grad_norm": 0.938657528442336, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 10900 + }, + { + "epoch": 0.10901, + "grad_norm": 1.2363502284270567, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 10901 + }, + { + "epoch": 0.10902, + "grad_norm": 0.7903678185797423, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 10902 + }, + { + "epoch": 0.10903, + "grad_norm": 0.6789417264937385, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 10903 + }, + { + "epoch": 0.10904, + "grad_norm": 0.7466995321912779, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 10904 + }, + { + "epoch": 0.10905, + "grad_norm": 0.8446333893055047, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 10905 + }, + { + "epoch": 0.10906, + "grad_norm": 0.9434146945884782, + "learning_rate": 0.003, + "loss": 4.113, + "step": 10906 + }, + { + "epoch": 0.10907, + "grad_norm": 0.9448346223448993, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 10907 + }, + { + "epoch": 0.10908, + "grad_norm": 0.9492929430380965, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 10908 + }, + { + "epoch": 0.10909, + "grad_norm": 0.9638647144203306, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 10909 + }, + { + "epoch": 0.1091, + "grad_norm": 0.8114844498739628, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 10910 + }, + { + "epoch": 0.10911, + "grad_norm": 0.7861873196944332, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 10911 + }, + { + "epoch": 0.10912, + "grad_norm": 0.6992439758399313, + "learning_rate": 0.003, + "loss": 4.083, + "step": 10912 + }, + { + "epoch": 0.10913, + "grad_norm": 0.6815651781157617, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 10913 + }, + { + "epoch": 0.10914, + "grad_norm": 0.6536113835814653, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 10914 + }, + { + "epoch": 0.10915, + "grad_norm": 0.6938082032494179, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 10915 + }, + { + "epoch": 0.10916, + "grad_norm": 0.7202674599350765, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 10916 + }, + { + "epoch": 0.10917, + "grad_norm": 0.7491490948689121, + "learning_rate": 0.003, + "loss": 4.093, + "step": 10917 + }, + { + "epoch": 0.10918, + "grad_norm": 1.11036055240353, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 10918 + }, + { + "epoch": 0.10919, + "grad_norm": 1.1337221606450771, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 10919 + }, + { + "epoch": 0.1092, + "grad_norm": 0.8428547311068283, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 10920 + }, + { + "epoch": 0.10921, + "grad_norm": 0.6082082658833271, + "learning_rate": 0.003, + "loss": 4.091, + "step": 10921 + }, + { + "epoch": 0.10922, + "grad_norm": 0.6315066804935527, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 10922 + }, + { + "epoch": 0.10923, + "grad_norm": 0.6061406163799906, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 10923 + }, + { + "epoch": 0.10924, + "grad_norm": 0.6369014987608396, + "learning_rate": 0.003, + "loss": 4.052, + "step": 10924 + }, + { + "epoch": 0.10925, + "grad_norm": 0.6509900001681895, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 10925 + }, + { + "epoch": 0.10926, + "grad_norm": 0.5911101588927634, + "learning_rate": 0.003, + "loss": 4.037, + "step": 10926 + }, + { + "epoch": 0.10927, + "grad_norm": 0.6090199769351918, + "learning_rate": 0.003, + "loss": 4.081, + "step": 10927 + }, + { + "epoch": 0.10928, + "grad_norm": 0.7583916555674786, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 10928 + }, + { + "epoch": 0.10929, + "grad_norm": 0.9306843772780642, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 10929 + }, + { + "epoch": 0.1093, + "grad_norm": 1.2362967491595924, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 10930 + }, + { + "epoch": 0.10931, + "grad_norm": 0.6502246716231995, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 10931 + }, + { + "epoch": 0.10932, + "grad_norm": 0.5470588873212986, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 10932 + }, + { + "epoch": 0.10933, + "grad_norm": 0.6038156560331971, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 10933 + }, + { + "epoch": 0.10934, + "grad_norm": 0.631710911079846, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 10934 + }, + { + "epoch": 0.10935, + "grad_norm": 0.6072788054734551, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 10935 + }, + { + "epoch": 0.10936, + "grad_norm": 0.5548572160813884, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 10936 + }, + { + "epoch": 0.10937, + "grad_norm": 0.5678083242728192, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 10937 + }, + { + "epoch": 0.10938, + "grad_norm": 0.5349364110178259, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 10938 + }, + { + "epoch": 0.10939, + "grad_norm": 0.5314674417062363, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 10939 + }, + { + "epoch": 0.1094, + "grad_norm": 0.5567007468861874, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 10940 + }, + { + "epoch": 0.10941, + "grad_norm": 0.6095491373372913, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 10941 + }, + { + "epoch": 0.10942, + "grad_norm": 0.7298043409485729, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 10942 + }, + { + "epoch": 0.10943, + "grad_norm": 0.786513510107108, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 10943 + }, + { + "epoch": 0.10944, + "grad_norm": 0.7774007804681506, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 10944 + }, + { + "epoch": 0.10945, + "grad_norm": 0.7495722469210125, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 10945 + }, + { + "epoch": 0.10946, + "grad_norm": 0.8064706771959848, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 10946 + }, + { + "epoch": 0.10947, + "grad_norm": 0.9393433262565961, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 10947 + }, + { + "epoch": 0.10948, + "grad_norm": 0.9387790046571931, + "learning_rate": 0.003, + "loss": 4.074, + "step": 10948 + }, + { + "epoch": 0.10949, + "grad_norm": 1.1410032379592778, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 10949 + }, + { + "epoch": 0.1095, + "grad_norm": 0.9129552961385187, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 10950 + }, + { + "epoch": 0.10951, + "grad_norm": 1.0435796410645963, + "learning_rate": 0.003, + "loss": 4.075, + "step": 10951 + }, + { + "epoch": 0.10952, + "grad_norm": 1.3358604841725472, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 10952 + }, + { + "epoch": 0.10953, + "grad_norm": 0.8310108373581879, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 10953 + }, + { + "epoch": 0.10954, + "grad_norm": 0.6723858624058059, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 10954 + }, + { + "epoch": 0.10955, + "grad_norm": 0.6611226964259507, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 10955 + }, + { + "epoch": 0.10956, + "grad_norm": 0.7011303501399112, + "learning_rate": 0.003, + "loss": 4.1017, + "step": 10956 + }, + { + "epoch": 0.10957, + "grad_norm": 0.6836216086535878, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 10957 + }, + { + "epoch": 0.10958, + "grad_norm": 0.7592475674795014, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 10958 + }, + { + "epoch": 0.10959, + "grad_norm": 0.7522399573457461, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 10959 + }, + { + "epoch": 0.1096, + "grad_norm": 0.7583206213419711, + "learning_rate": 0.003, + "loss": 4.058, + "step": 10960 + }, + { + "epoch": 0.10961, + "grad_norm": 0.7948798372620559, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 10961 + }, + { + "epoch": 0.10962, + "grad_norm": 0.9520999820511873, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 10962 + }, + { + "epoch": 0.10963, + "grad_norm": 0.9934263182752433, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 10963 + }, + { + "epoch": 0.10964, + "grad_norm": 1.0404312099324224, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 10964 + }, + { + "epoch": 0.10965, + "grad_norm": 0.9314463760092233, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 10965 + }, + { + "epoch": 0.10966, + "grad_norm": 0.8350231443250099, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 10966 + }, + { + "epoch": 0.10967, + "grad_norm": 0.9320300124939774, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 10967 + }, + { + "epoch": 0.10968, + "grad_norm": 0.9215579233627603, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 10968 + }, + { + "epoch": 0.10969, + "grad_norm": 1.0604243559948778, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 10969 + }, + { + "epoch": 0.1097, + "grad_norm": 1.1138978634435108, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 10970 + }, + { + "epoch": 0.10971, + "grad_norm": 0.8951876392104671, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 10971 + }, + { + "epoch": 0.10972, + "grad_norm": 0.9676653328126739, + "learning_rate": 0.003, + "loss": 4.1456, + "step": 10972 + }, + { + "epoch": 0.10973, + "grad_norm": 1.0352834124023993, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 10973 + }, + { + "epoch": 0.10974, + "grad_norm": 1.0675730058873871, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 10974 + }, + { + "epoch": 0.10975, + "grad_norm": 0.9740004559762198, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 10975 + }, + { + "epoch": 0.10976, + "grad_norm": 1.0447644987824045, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 10976 + }, + { + "epoch": 0.10977, + "grad_norm": 1.0200289761418742, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 10977 + }, + { + "epoch": 0.10978, + "grad_norm": 0.9365717545240859, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 10978 + }, + { + "epoch": 0.10979, + "grad_norm": 0.8362679752586221, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 10979 + }, + { + "epoch": 0.1098, + "grad_norm": 0.7172618437426157, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 10980 + }, + { + "epoch": 0.10981, + "grad_norm": 0.6864193346218213, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 10981 + }, + { + "epoch": 0.10982, + "grad_norm": 0.5895896832601729, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 10982 + }, + { + "epoch": 0.10983, + "grad_norm": 0.5787111289779593, + "learning_rate": 0.003, + "loss": 4.064, + "step": 10983 + }, + { + "epoch": 0.10984, + "grad_norm": 0.6057523165429262, + "learning_rate": 0.003, + "loss": 4.106, + "step": 10984 + }, + { + "epoch": 0.10985, + "grad_norm": 0.6514886745623131, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 10985 + }, + { + "epoch": 0.10986, + "grad_norm": 0.7435079048577373, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 10986 + }, + { + "epoch": 0.10987, + "grad_norm": 0.9200597665780822, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 10987 + }, + { + "epoch": 0.10988, + "grad_norm": 1.1265238934764898, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 10988 + }, + { + "epoch": 0.10989, + "grad_norm": 0.88644921715438, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 10989 + }, + { + "epoch": 0.1099, + "grad_norm": 0.8325133772180551, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 10990 + }, + { + "epoch": 0.10991, + "grad_norm": 0.8420468014984893, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 10991 + }, + { + "epoch": 0.10992, + "grad_norm": 0.8537316831757594, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 10992 + }, + { + "epoch": 0.10993, + "grad_norm": 0.7940387723272442, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 10993 + }, + { + "epoch": 0.10994, + "grad_norm": 0.6710457088949344, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 10994 + }, + { + "epoch": 0.10995, + "grad_norm": 0.6142598287538222, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 10995 + }, + { + "epoch": 0.10996, + "grad_norm": 0.6660630848750299, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 10996 + }, + { + "epoch": 0.10997, + "grad_norm": 0.7899825737081104, + "learning_rate": 0.003, + "loss": 4.1131, + "step": 10997 + }, + { + "epoch": 0.10998, + "grad_norm": 0.8923751751449065, + "learning_rate": 0.003, + "loss": 4.12, + "step": 10998 + }, + { + "epoch": 0.10999, + "grad_norm": 0.9402386393787387, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 10999 + }, + { + "epoch": 0.11, + "grad_norm": 0.9050561963603275, + "learning_rate": 0.003, + "loss": 4.1182, + "step": 11000 + }, + { + "epoch": 0.11001, + "grad_norm": 0.885324393665634, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 11001 + }, + { + "epoch": 0.11002, + "grad_norm": 0.9434080532034825, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 11002 + }, + { + "epoch": 0.11003, + "grad_norm": 1.010969350024946, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 11003 + }, + { + "epoch": 0.11004, + "grad_norm": 0.954640988869078, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 11004 + }, + { + "epoch": 0.11005, + "grad_norm": 0.8240164005546453, + "learning_rate": 0.003, + "loss": 4.064, + "step": 11005 + }, + { + "epoch": 0.11006, + "grad_norm": 0.7594825840470841, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 11006 + }, + { + "epoch": 0.11007, + "grad_norm": 0.7590475204659565, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 11007 + }, + { + "epoch": 0.11008, + "grad_norm": 0.7958304222714225, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 11008 + }, + { + "epoch": 0.11009, + "grad_norm": 0.8287995764159665, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 11009 + }, + { + "epoch": 0.1101, + "grad_norm": 0.9174224956026463, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 11010 + }, + { + "epoch": 0.11011, + "grad_norm": 0.9810955939848184, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 11011 + }, + { + "epoch": 0.11012, + "grad_norm": 0.9588660180330635, + "learning_rate": 0.003, + "loss": 4.082, + "step": 11012 + }, + { + "epoch": 0.11013, + "grad_norm": 0.818062590515428, + "learning_rate": 0.003, + "loss": 4.096, + "step": 11013 + }, + { + "epoch": 0.11014, + "grad_norm": 0.6982672473244061, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 11014 + }, + { + "epoch": 0.11015, + "grad_norm": 0.7514408359654094, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 11015 + }, + { + "epoch": 0.11016, + "grad_norm": 0.7207043759222721, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 11016 + }, + { + "epoch": 0.11017, + "grad_norm": 0.6165799587110872, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 11017 + }, + { + "epoch": 0.11018, + "grad_norm": 0.6810053095086784, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 11018 + }, + { + "epoch": 0.11019, + "grad_norm": 0.6851205152192956, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 11019 + }, + { + "epoch": 0.1102, + "grad_norm": 0.7128614695443434, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 11020 + }, + { + "epoch": 0.11021, + "grad_norm": 0.7954986146548706, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 11021 + }, + { + "epoch": 0.11022, + "grad_norm": 0.9822662453480975, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 11022 + }, + { + "epoch": 0.11023, + "grad_norm": 0.992584945136861, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 11023 + }, + { + "epoch": 0.11024, + "grad_norm": 0.9429960341025528, + "learning_rate": 0.003, + "loss": 4.11, + "step": 11024 + }, + { + "epoch": 0.11025, + "grad_norm": 0.8325519591187426, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 11025 + }, + { + "epoch": 0.11026, + "grad_norm": 0.8109579572209392, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 11026 + }, + { + "epoch": 0.11027, + "grad_norm": 0.8503578340284045, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 11027 + }, + { + "epoch": 0.11028, + "grad_norm": 0.7623548658665757, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 11028 + }, + { + "epoch": 0.11029, + "grad_norm": 0.6574600909508291, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 11029 + }, + { + "epoch": 0.1103, + "grad_norm": 0.6153319028159655, + "learning_rate": 0.003, + "loss": 4.082, + "step": 11030 + }, + { + "epoch": 0.11031, + "grad_norm": 0.6706428602781416, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 11031 + }, + { + "epoch": 0.11032, + "grad_norm": 0.7446375104894701, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 11032 + }, + { + "epoch": 0.11033, + "grad_norm": 0.8287999931870835, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 11033 + }, + { + "epoch": 0.11034, + "grad_norm": 0.9769780850481814, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 11034 + }, + { + "epoch": 0.11035, + "grad_norm": 1.0533298010822936, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 11035 + }, + { + "epoch": 0.11036, + "grad_norm": 0.829555188618686, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 11036 + }, + { + "epoch": 0.11037, + "grad_norm": 0.6487286656984719, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 11037 + }, + { + "epoch": 0.11038, + "grad_norm": 0.6380967062306625, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 11038 + }, + { + "epoch": 0.11039, + "grad_norm": 0.7177361539966476, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 11039 + }, + { + "epoch": 0.1104, + "grad_norm": 0.6958535348954583, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 11040 + }, + { + "epoch": 0.11041, + "grad_norm": 0.5940027764338971, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 11041 + }, + { + "epoch": 0.11042, + "grad_norm": 0.6049326361318368, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 11042 + }, + { + "epoch": 0.11043, + "grad_norm": 0.629245191369106, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 11043 + }, + { + "epoch": 0.11044, + "grad_norm": 0.5444625180097691, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 11044 + }, + { + "epoch": 0.11045, + "grad_norm": 0.5617706070761912, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 11045 + }, + { + "epoch": 0.11046, + "grad_norm": 0.5611245309640837, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 11046 + }, + { + "epoch": 0.11047, + "grad_norm": 0.5187354432594959, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 11047 + }, + { + "epoch": 0.11048, + "grad_norm": 0.5788524265693503, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 11048 + }, + { + "epoch": 0.11049, + "grad_norm": 0.7277778292414144, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 11049 + }, + { + "epoch": 0.1105, + "grad_norm": 0.9135531216313109, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 11050 + }, + { + "epoch": 0.11051, + "grad_norm": 1.0056469057417077, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 11051 + }, + { + "epoch": 0.11052, + "grad_norm": 0.9280627702680145, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 11052 + }, + { + "epoch": 0.11053, + "grad_norm": 0.898418645317437, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 11053 + }, + { + "epoch": 0.11054, + "grad_norm": 0.9058851919381276, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 11054 + }, + { + "epoch": 0.11055, + "grad_norm": 0.8116406755293073, + "learning_rate": 0.003, + "loss": 4.081, + "step": 11055 + }, + { + "epoch": 0.11056, + "grad_norm": 0.8161785790884862, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 11056 + }, + { + "epoch": 0.11057, + "grad_norm": 0.867405007787928, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 11057 + }, + { + "epoch": 0.11058, + "grad_norm": 0.9816412480496788, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 11058 + }, + { + "epoch": 0.11059, + "grad_norm": 1.1515701005249115, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 11059 + }, + { + "epoch": 0.1106, + "grad_norm": 0.854982835118098, + "learning_rate": 0.003, + "loss": 4.093, + "step": 11060 + }, + { + "epoch": 0.11061, + "grad_norm": 0.8438666981265432, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 11061 + }, + { + "epoch": 0.11062, + "grad_norm": 0.9551886092704508, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 11062 + }, + { + "epoch": 0.11063, + "grad_norm": 0.933860815146107, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 11063 + }, + { + "epoch": 0.11064, + "grad_norm": 0.7776075181114828, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 11064 + }, + { + "epoch": 0.11065, + "grad_norm": 0.7850056440416847, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 11065 + }, + { + "epoch": 0.11066, + "grad_norm": 0.659664837771907, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 11066 + }, + { + "epoch": 0.11067, + "grad_norm": 0.6593281025138392, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 11067 + }, + { + "epoch": 0.11068, + "grad_norm": 0.6687637937193209, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 11068 + }, + { + "epoch": 0.11069, + "grad_norm": 0.7001621764244782, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 11069 + }, + { + "epoch": 0.1107, + "grad_norm": 0.7262171290177102, + "learning_rate": 0.003, + "loss": 4.086, + "step": 11070 + }, + { + "epoch": 0.11071, + "grad_norm": 0.800625243464924, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 11071 + }, + { + "epoch": 0.11072, + "grad_norm": 1.0052207819407468, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 11072 + }, + { + "epoch": 0.11073, + "grad_norm": 1.001613976466175, + "learning_rate": 0.003, + "loss": 4.1241, + "step": 11073 + }, + { + "epoch": 0.11074, + "grad_norm": 0.9625258224419659, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 11074 + }, + { + "epoch": 0.11075, + "grad_norm": 1.1074935298853081, + "learning_rate": 0.003, + "loss": 4.1152, + "step": 11075 + }, + { + "epoch": 0.11076, + "grad_norm": 1.0561029884393052, + "learning_rate": 0.003, + "loss": 4.09, + "step": 11076 + }, + { + "epoch": 0.11077, + "grad_norm": 0.9285331088718336, + "learning_rate": 0.003, + "loss": 4.139, + "step": 11077 + }, + { + "epoch": 0.11078, + "grad_norm": 0.808604672012496, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 11078 + }, + { + "epoch": 0.11079, + "grad_norm": 0.7884733933274555, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 11079 + }, + { + "epoch": 0.1108, + "grad_norm": 0.8357394406437376, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 11080 + }, + { + "epoch": 0.11081, + "grad_norm": 0.8803590252787716, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 11081 + }, + { + "epoch": 0.11082, + "grad_norm": 0.9368453183007583, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 11082 + }, + { + "epoch": 0.11083, + "grad_norm": 0.9299308351951199, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 11083 + }, + { + "epoch": 0.11084, + "grad_norm": 0.870455859227154, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 11084 + }, + { + "epoch": 0.11085, + "grad_norm": 0.7216008943105169, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 11085 + }, + { + "epoch": 0.11086, + "grad_norm": 0.6948711660910164, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 11086 + }, + { + "epoch": 0.11087, + "grad_norm": 0.6945540607673151, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 11087 + }, + { + "epoch": 0.11088, + "grad_norm": 0.638563248131485, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 11088 + }, + { + "epoch": 0.11089, + "grad_norm": 0.6341263639431465, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 11089 + }, + { + "epoch": 0.1109, + "grad_norm": 0.6174699458323581, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 11090 + }, + { + "epoch": 0.11091, + "grad_norm": 0.6103462124790471, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 11091 + }, + { + "epoch": 0.11092, + "grad_norm": 0.5651746008201992, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 11092 + }, + { + "epoch": 0.11093, + "grad_norm": 0.5601775914620788, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 11093 + }, + { + "epoch": 0.11094, + "grad_norm": 0.5516687114517382, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 11094 + }, + { + "epoch": 0.11095, + "grad_norm": 0.5876552445906201, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 11095 + }, + { + "epoch": 0.11096, + "grad_norm": 0.6849229580947964, + "learning_rate": 0.003, + "loss": 4.082, + "step": 11096 + }, + { + "epoch": 0.11097, + "grad_norm": 0.90306848388154, + "learning_rate": 0.003, + "loss": 4.092, + "step": 11097 + }, + { + "epoch": 0.11098, + "grad_norm": 1.0965563470179573, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 11098 + }, + { + "epoch": 0.11099, + "grad_norm": 0.7817470235674453, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 11099 + }, + { + "epoch": 0.111, + "grad_norm": 0.5655622318949944, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 11100 + }, + { + "epoch": 0.11101, + "grad_norm": 0.6798505146702352, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 11101 + }, + { + "epoch": 0.11102, + "grad_norm": 0.8259776028595571, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 11102 + }, + { + "epoch": 0.11103, + "grad_norm": 0.8961065505995598, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 11103 + }, + { + "epoch": 0.11104, + "grad_norm": 0.8356772914510698, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 11104 + }, + { + "epoch": 0.11105, + "grad_norm": 0.8829032009312003, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 11105 + }, + { + "epoch": 0.11106, + "grad_norm": 0.9776127333423626, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 11106 + }, + { + "epoch": 0.11107, + "grad_norm": 0.9884899110891495, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 11107 + }, + { + "epoch": 0.11108, + "grad_norm": 0.9083186955420474, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 11108 + }, + { + "epoch": 0.11109, + "grad_norm": 0.9905629247807923, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 11109 + }, + { + "epoch": 0.1111, + "grad_norm": 1.1256324486484663, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 11110 + }, + { + "epoch": 0.11111, + "grad_norm": 0.7276573929050033, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 11111 + }, + { + "epoch": 0.11112, + "grad_norm": 0.7229532212478316, + "learning_rate": 0.003, + "loss": 4.11, + "step": 11112 + }, + { + "epoch": 0.11113, + "grad_norm": 0.7357412365818277, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 11113 + }, + { + "epoch": 0.11114, + "grad_norm": 0.6686360455122295, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 11114 + }, + { + "epoch": 0.11115, + "grad_norm": 0.7741764825670838, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 11115 + }, + { + "epoch": 0.11116, + "grad_norm": 1.089654929098186, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 11116 + }, + { + "epoch": 0.11117, + "grad_norm": 1.003149957043394, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 11117 + }, + { + "epoch": 0.11118, + "grad_norm": 0.7553134094623033, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 11118 + }, + { + "epoch": 0.11119, + "grad_norm": 0.582610597098817, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 11119 + }, + { + "epoch": 0.1112, + "grad_norm": 0.6612281295879227, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 11120 + }, + { + "epoch": 0.11121, + "grad_norm": 0.8000233847001187, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 11121 + }, + { + "epoch": 0.11122, + "grad_norm": 0.8404674674939324, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 11122 + }, + { + "epoch": 0.11123, + "grad_norm": 0.979990963536137, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 11123 + }, + { + "epoch": 0.11124, + "grad_norm": 1.1400318707447292, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 11124 + }, + { + "epoch": 0.11125, + "grad_norm": 0.9186690040538052, + "learning_rate": 0.003, + "loss": 4.1055, + "step": 11125 + }, + { + "epoch": 0.11126, + "grad_norm": 1.0404524691016466, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 11126 + }, + { + "epoch": 0.11127, + "grad_norm": 1.0107939634734449, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 11127 + }, + { + "epoch": 0.11128, + "grad_norm": 0.9893896552321982, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 11128 + }, + { + "epoch": 0.11129, + "grad_norm": 0.8167246210083996, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 11129 + }, + { + "epoch": 0.1113, + "grad_norm": 0.7190600151740959, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 11130 + }, + { + "epoch": 0.11131, + "grad_norm": 0.6945641839293223, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 11131 + }, + { + "epoch": 0.11132, + "grad_norm": 0.828200398218642, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 11132 + }, + { + "epoch": 0.11133, + "grad_norm": 0.7873960152046607, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 11133 + }, + { + "epoch": 0.11134, + "grad_norm": 0.759110078625811, + "learning_rate": 0.003, + "loss": 4.064, + "step": 11134 + }, + { + "epoch": 0.11135, + "grad_norm": 0.7169621318467949, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 11135 + }, + { + "epoch": 0.11136, + "grad_norm": 0.645979991251931, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 11136 + }, + { + "epoch": 0.11137, + "grad_norm": 0.68607581660032, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 11137 + }, + { + "epoch": 0.11138, + "grad_norm": 0.785577460381717, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 11138 + }, + { + "epoch": 0.11139, + "grad_norm": 0.8364211665979386, + "learning_rate": 0.003, + "loss": 4.106, + "step": 11139 + }, + { + "epoch": 0.1114, + "grad_norm": 0.7642698793727908, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 11140 + }, + { + "epoch": 0.11141, + "grad_norm": 0.713957137186134, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 11141 + }, + { + "epoch": 0.11142, + "grad_norm": 0.8090643725446109, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 11142 + }, + { + "epoch": 0.11143, + "grad_norm": 0.7445012677708782, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 11143 + }, + { + "epoch": 0.11144, + "grad_norm": 0.7713246383144959, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 11144 + }, + { + "epoch": 0.11145, + "grad_norm": 0.9585554430132573, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 11145 + }, + { + "epoch": 0.11146, + "grad_norm": 1.2437802015715147, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 11146 + }, + { + "epoch": 0.11147, + "grad_norm": 0.798064848892936, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 11147 + }, + { + "epoch": 0.11148, + "grad_norm": 0.6344607558816502, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 11148 + }, + { + "epoch": 0.11149, + "grad_norm": 0.5963078934623987, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 11149 + }, + { + "epoch": 0.1115, + "grad_norm": 0.7150086302385679, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 11150 + }, + { + "epoch": 0.11151, + "grad_norm": 0.7228254346279399, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 11151 + }, + { + "epoch": 0.11152, + "grad_norm": 0.7880303348317202, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 11152 + }, + { + "epoch": 0.11153, + "grad_norm": 0.810534284842259, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 11153 + }, + { + "epoch": 0.11154, + "grad_norm": 0.8074464292942731, + "learning_rate": 0.003, + "loss": 4.092, + "step": 11154 + }, + { + "epoch": 0.11155, + "grad_norm": 0.8382989830252079, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 11155 + }, + { + "epoch": 0.11156, + "grad_norm": 0.875789436517272, + "learning_rate": 0.003, + "loss": 4.094, + "step": 11156 + }, + { + "epoch": 0.11157, + "grad_norm": 0.9604344902781272, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 11157 + }, + { + "epoch": 0.11158, + "grad_norm": 0.7338788485272214, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 11158 + }, + { + "epoch": 0.11159, + "grad_norm": 0.7075825561394384, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 11159 + }, + { + "epoch": 0.1116, + "grad_norm": 0.606268141593149, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 11160 + }, + { + "epoch": 0.11161, + "grad_norm": 0.679934195703754, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 11161 + }, + { + "epoch": 0.11162, + "grad_norm": 0.7656715332302683, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 11162 + }, + { + "epoch": 0.11163, + "grad_norm": 0.7480394445842258, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 11163 + }, + { + "epoch": 0.11164, + "grad_norm": 0.6028095169334248, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 11164 + }, + { + "epoch": 0.11165, + "grad_norm": 0.6318752878707158, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 11165 + }, + { + "epoch": 0.11166, + "grad_norm": 0.8879033904433962, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 11166 + }, + { + "epoch": 0.11167, + "grad_norm": 1.4274703348853277, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 11167 + }, + { + "epoch": 0.11168, + "grad_norm": 0.6922698324968064, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 11168 + }, + { + "epoch": 0.11169, + "grad_norm": 0.6067024325327948, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 11169 + }, + { + "epoch": 0.1117, + "grad_norm": 0.7340926860018586, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 11170 + }, + { + "epoch": 0.11171, + "grad_norm": 0.8701848545174916, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 11171 + }, + { + "epoch": 0.11172, + "grad_norm": 0.9155073360821716, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 11172 + }, + { + "epoch": 0.11173, + "grad_norm": 0.8873135441014273, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 11173 + }, + { + "epoch": 0.11174, + "grad_norm": 0.8199142395718748, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 11174 + }, + { + "epoch": 0.11175, + "grad_norm": 0.8017348512165051, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 11175 + }, + { + "epoch": 0.11176, + "grad_norm": 0.786558601699622, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 11176 + }, + { + "epoch": 0.11177, + "grad_norm": 0.7339391142845619, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 11177 + }, + { + "epoch": 0.11178, + "grad_norm": 0.6520365160874647, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 11178 + }, + { + "epoch": 0.11179, + "grad_norm": 0.6960486782654498, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 11179 + }, + { + "epoch": 0.1118, + "grad_norm": 0.7364223235410267, + "learning_rate": 0.003, + "loss": 4.08, + "step": 11180 + }, + { + "epoch": 0.11181, + "grad_norm": 0.8184326717593928, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 11181 + }, + { + "epoch": 0.11182, + "grad_norm": 1.0033279367101047, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 11182 + }, + { + "epoch": 0.11183, + "grad_norm": 1.3151922001789536, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 11183 + }, + { + "epoch": 0.11184, + "grad_norm": 0.7681398620969967, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 11184 + }, + { + "epoch": 0.11185, + "grad_norm": 0.6324973183291746, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 11185 + }, + { + "epoch": 0.11186, + "grad_norm": 0.6442181131175263, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 11186 + }, + { + "epoch": 0.11187, + "grad_norm": 0.6632643336699477, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 11187 + }, + { + "epoch": 0.11188, + "grad_norm": 0.737583248978792, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 11188 + }, + { + "epoch": 0.11189, + "grad_norm": 0.9257422382836547, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 11189 + }, + { + "epoch": 0.1119, + "grad_norm": 1.0748123757293582, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 11190 + }, + { + "epoch": 0.11191, + "grad_norm": 1.0669751328722072, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 11191 + }, + { + "epoch": 0.11192, + "grad_norm": 0.9475513799019504, + "learning_rate": 0.003, + "loss": 4.12, + "step": 11192 + }, + { + "epoch": 0.11193, + "grad_norm": 0.980393271477006, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 11193 + }, + { + "epoch": 0.11194, + "grad_norm": 0.9463524671897485, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 11194 + }, + { + "epoch": 0.11195, + "grad_norm": 0.8342265267333907, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 11195 + }, + { + "epoch": 0.11196, + "grad_norm": 0.7999477240923631, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 11196 + }, + { + "epoch": 0.11197, + "grad_norm": 0.8902807757761849, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 11197 + }, + { + "epoch": 0.11198, + "grad_norm": 0.8987577748435858, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 11198 + }, + { + "epoch": 0.11199, + "grad_norm": 0.9852587056057885, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 11199 + }, + { + "epoch": 0.112, + "grad_norm": 1.4745377463079015, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 11200 + }, + { + "epoch": 0.11201, + "grad_norm": 0.6922170191393475, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 11201 + }, + { + "epoch": 0.11202, + "grad_norm": 0.667280520513202, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 11202 + }, + { + "epoch": 0.11203, + "grad_norm": 0.725043754467325, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 11203 + }, + { + "epoch": 0.11204, + "grad_norm": 0.7905874419302064, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 11204 + }, + { + "epoch": 0.11205, + "grad_norm": 0.798668273386535, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 11205 + }, + { + "epoch": 0.11206, + "grad_norm": 0.9353317315582219, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 11206 + }, + { + "epoch": 0.11207, + "grad_norm": 1.1942296263526493, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 11207 + }, + { + "epoch": 0.11208, + "grad_norm": 0.8285060649114675, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 11208 + }, + { + "epoch": 0.11209, + "grad_norm": 0.7602845324086417, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 11209 + }, + { + "epoch": 0.1121, + "grad_norm": 0.7461526534362624, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 11210 + }, + { + "epoch": 0.11211, + "grad_norm": 0.7174801894516294, + "learning_rate": 0.003, + "loss": 4.103, + "step": 11211 + }, + { + "epoch": 0.11212, + "grad_norm": 0.5623731031335691, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 11212 + }, + { + "epoch": 0.11213, + "grad_norm": 0.5256282911301774, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 11213 + }, + { + "epoch": 0.11214, + "grad_norm": 0.524622311054672, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 11214 + }, + { + "epoch": 0.11215, + "grad_norm": 0.5705507694064851, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 11215 + }, + { + "epoch": 0.11216, + "grad_norm": 0.5728925901575492, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 11216 + }, + { + "epoch": 0.11217, + "grad_norm": 0.5692961251822839, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 11217 + }, + { + "epoch": 0.11218, + "grad_norm": 0.6711715531082638, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 11218 + }, + { + "epoch": 0.11219, + "grad_norm": 0.846884977023136, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 11219 + }, + { + "epoch": 0.1122, + "grad_norm": 0.9680363037463088, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 11220 + }, + { + "epoch": 0.11221, + "grad_norm": 1.0420788650409123, + "learning_rate": 0.003, + "loss": 4.085, + "step": 11221 + }, + { + "epoch": 0.11222, + "grad_norm": 0.9004639432683451, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 11222 + }, + { + "epoch": 0.11223, + "grad_norm": 0.8393242438846302, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 11223 + }, + { + "epoch": 0.11224, + "grad_norm": 0.7990820081600976, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 11224 + }, + { + "epoch": 0.11225, + "grad_norm": 0.6574466208192928, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 11225 + }, + { + "epoch": 0.11226, + "grad_norm": 0.6759618474023606, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 11226 + }, + { + "epoch": 0.11227, + "grad_norm": 0.7609637817250507, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 11227 + }, + { + "epoch": 0.11228, + "grad_norm": 0.7661746709409221, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 11228 + }, + { + "epoch": 0.11229, + "grad_norm": 0.6810587343834144, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 11229 + }, + { + "epoch": 0.1123, + "grad_norm": 0.7436685296036734, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 11230 + }, + { + "epoch": 0.11231, + "grad_norm": 0.7175074007556332, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 11231 + }, + { + "epoch": 0.11232, + "grad_norm": 0.8598338665744785, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 11232 + }, + { + "epoch": 0.11233, + "grad_norm": 0.858424185394917, + "learning_rate": 0.003, + "loss": 4.096, + "step": 11233 + }, + { + "epoch": 0.11234, + "grad_norm": 0.8579128611958361, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 11234 + }, + { + "epoch": 0.11235, + "grad_norm": 1.0397658926409876, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 11235 + }, + { + "epoch": 0.11236, + "grad_norm": 1.1354136952993683, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 11236 + }, + { + "epoch": 0.11237, + "grad_norm": 0.9343726823813238, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 11237 + }, + { + "epoch": 0.11238, + "grad_norm": 0.9224423526242113, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 11238 + }, + { + "epoch": 0.11239, + "grad_norm": 1.130748748117673, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 11239 + }, + { + "epoch": 0.1124, + "grad_norm": 1.0017172539268393, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 11240 + }, + { + "epoch": 0.11241, + "grad_norm": 0.9175963249395397, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 11241 + }, + { + "epoch": 0.11242, + "grad_norm": 0.9534075599579533, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 11242 + }, + { + "epoch": 0.11243, + "grad_norm": 0.9189430914908193, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 11243 + }, + { + "epoch": 0.11244, + "grad_norm": 0.7555180429260873, + "learning_rate": 0.003, + "loss": 4.081, + "step": 11244 + }, + { + "epoch": 0.11245, + "grad_norm": 0.7229268696180132, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 11245 + }, + { + "epoch": 0.11246, + "grad_norm": 0.6252394878517432, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 11246 + }, + { + "epoch": 0.11247, + "grad_norm": 0.6029174577675842, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 11247 + }, + { + "epoch": 0.11248, + "grad_norm": 0.6090033628973571, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 11248 + }, + { + "epoch": 0.11249, + "grad_norm": 0.5542984043971567, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 11249 + }, + { + "epoch": 0.1125, + "grad_norm": 0.5694809489509668, + "learning_rate": 0.003, + "loss": 4.055, + "step": 11250 + }, + { + "epoch": 0.11251, + "grad_norm": 0.7028353991716569, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 11251 + }, + { + "epoch": 0.11252, + "grad_norm": 0.9484323487852909, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 11252 + }, + { + "epoch": 0.11253, + "grad_norm": 1.125644085468693, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 11253 + }, + { + "epoch": 0.11254, + "grad_norm": 0.7806047546301479, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 11254 + }, + { + "epoch": 0.11255, + "grad_norm": 0.6341972128498081, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 11255 + }, + { + "epoch": 0.11256, + "grad_norm": 0.6939257636285394, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 11256 + }, + { + "epoch": 0.11257, + "grad_norm": 0.7911168291243957, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 11257 + }, + { + "epoch": 0.11258, + "grad_norm": 0.8498509190820276, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 11258 + }, + { + "epoch": 0.11259, + "grad_norm": 0.935207478747258, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 11259 + }, + { + "epoch": 0.1126, + "grad_norm": 0.9561311217334402, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 11260 + }, + { + "epoch": 0.11261, + "grad_norm": 0.9565326560921091, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 11261 + }, + { + "epoch": 0.11262, + "grad_norm": 0.9369527481049401, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 11262 + }, + { + "epoch": 0.11263, + "grad_norm": 0.8125887510030267, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 11263 + }, + { + "epoch": 0.11264, + "grad_norm": 0.7702865848328564, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 11264 + }, + { + "epoch": 0.11265, + "grad_norm": 0.8334162587414682, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 11265 + }, + { + "epoch": 0.11266, + "grad_norm": 0.7473872641971427, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 11266 + }, + { + "epoch": 0.11267, + "grad_norm": 0.6758426957564395, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 11267 + }, + { + "epoch": 0.11268, + "grad_norm": 0.7094024271823733, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 11268 + }, + { + "epoch": 0.11269, + "grad_norm": 0.6070926927638549, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 11269 + }, + { + "epoch": 0.1127, + "grad_norm": 0.4901913598283984, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 11270 + }, + { + "epoch": 0.11271, + "grad_norm": 0.4675843897479997, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 11271 + }, + { + "epoch": 0.11272, + "grad_norm": 0.4938736742635015, + "learning_rate": 0.003, + "loss": 4.064, + "step": 11272 + }, + { + "epoch": 0.11273, + "grad_norm": 0.5255525500157796, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 11273 + }, + { + "epoch": 0.11274, + "grad_norm": 0.5870623589704691, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 11274 + }, + { + "epoch": 0.11275, + "grad_norm": 0.6229595008991105, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 11275 + }, + { + "epoch": 0.11276, + "grad_norm": 0.6703357191635001, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 11276 + }, + { + "epoch": 0.11277, + "grad_norm": 0.836221722646284, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 11277 + }, + { + "epoch": 0.11278, + "grad_norm": 1.12987602448386, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 11278 + }, + { + "epoch": 0.11279, + "grad_norm": 0.8932128144389052, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 11279 + }, + { + "epoch": 0.1128, + "grad_norm": 0.8636575495307199, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 11280 + }, + { + "epoch": 0.11281, + "grad_norm": 0.8866969543624653, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 11281 + }, + { + "epoch": 0.11282, + "grad_norm": 0.9013365729779537, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 11282 + }, + { + "epoch": 0.11283, + "grad_norm": 0.8704159020885619, + "learning_rate": 0.003, + "loss": 4.09, + "step": 11283 + }, + { + "epoch": 0.11284, + "grad_norm": 0.8582948331776299, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 11284 + }, + { + "epoch": 0.11285, + "grad_norm": 0.7706440632938059, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 11285 + }, + { + "epoch": 0.11286, + "grad_norm": 0.7852780669628593, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 11286 + }, + { + "epoch": 0.11287, + "grad_norm": 0.8040334833067401, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 11287 + }, + { + "epoch": 0.11288, + "grad_norm": 0.6842046423450019, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 11288 + }, + { + "epoch": 0.11289, + "grad_norm": 0.7553569736226131, + "learning_rate": 0.003, + "loss": 4.1, + "step": 11289 + }, + { + "epoch": 0.1129, + "grad_norm": 0.7935338155336644, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 11290 + }, + { + "epoch": 0.11291, + "grad_norm": 0.7883400874254108, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 11291 + }, + { + "epoch": 0.11292, + "grad_norm": 0.8376593583255619, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 11292 + }, + { + "epoch": 0.11293, + "grad_norm": 0.9521589854357583, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 11293 + }, + { + "epoch": 0.11294, + "grad_norm": 1.0937306713955737, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 11294 + }, + { + "epoch": 0.11295, + "grad_norm": 1.161053778562748, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 11295 + }, + { + "epoch": 0.11296, + "grad_norm": 0.924084583616854, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 11296 + }, + { + "epoch": 0.11297, + "grad_norm": 0.8614899789944479, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 11297 + }, + { + "epoch": 0.11298, + "grad_norm": 0.7961809396109322, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 11298 + }, + { + "epoch": 0.11299, + "grad_norm": 0.8153057757029059, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 11299 + }, + { + "epoch": 0.113, + "grad_norm": 0.8197364004628112, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 11300 + }, + { + "epoch": 0.11301, + "grad_norm": 0.7907672894411515, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 11301 + }, + { + "epoch": 0.11302, + "grad_norm": 0.8034674001126053, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 11302 + }, + { + "epoch": 0.11303, + "grad_norm": 0.8567329453989047, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 11303 + }, + { + "epoch": 0.11304, + "grad_norm": 0.7296740606258915, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 11304 + }, + { + "epoch": 0.11305, + "grad_norm": 0.7929787988472935, + "learning_rate": 0.003, + "loss": 4.067, + "step": 11305 + }, + { + "epoch": 0.11306, + "grad_norm": 0.7956940101721797, + "learning_rate": 0.003, + "loss": 4.063, + "step": 11306 + }, + { + "epoch": 0.11307, + "grad_norm": 0.832525317844469, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 11307 + }, + { + "epoch": 0.11308, + "grad_norm": 1.0211642140432524, + "learning_rate": 0.003, + "loss": 4.084, + "step": 11308 + }, + { + "epoch": 0.11309, + "grad_norm": 1.2631486891454777, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 11309 + }, + { + "epoch": 0.1131, + "grad_norm": 0.6534457353954479, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 11310 + }, + { + "epoch": 0.11311, + "grad_norm": 0.641428212190924, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 11311 + }, + { + "epoch": 0.11312, + "grad_norm": 0.6071836945502547, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 11312 + }, + { + "epoch": 0.11313, + "grad_norm": 0.6827199782593909, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 11313 + }, + { + "epoch": 0.11314, + "grad_norm": 0.7595211649403993, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 11314 + }, + { + "epoch": 0.11315, + "grad_norm": 0.7847473307102096, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 11315 + }, + { + "epoch": 0.11316, + "grad_norm": 0.8610672104104965, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 11316 + }, + { + "epoch": 0.11317, + "grad_norm": 1.0152458189238245, + "learning_rate": 0.003, + "loss": 4.094, + "step": 11317 + }, + { + "epoch": 0.11318, + "grad_norm": 1.0270125585690475, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 11318 + }, + { + "epoch": 0.11319, + "grad_norm": 0.7529031368887859, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 11319 + }, + { + "epoch": 0.1132, + "grad_norm": 0.7652686727641066, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 11320 + }, + { + "epoch": 0.11321, + "grad_norm": 0.7134228153263527, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 11321 + }, + { + "epoch": 0.11322, + "grad_norm": 0.749176398389972, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 11322 + }, + { + "epoch": 0.11323, + "grad_norm": 0.8053900797064021, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 11323 + }, + { + "epoch": 0.11324, + "grad_norm": 0.7753485199215412, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 11324 + }, + { + "epoch": 0.11325, + "grad_norm": 0.7849386213255387, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 11325 + }, + { + "epoch": 0.11326, + "grad_norm": 0.7789568595648719, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 11326 + }, + { + "epoch": 0.11327, + "grad_norm": 0.714660474310829, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 11327 + }, + { + "epoch": 0.11328, + "grad_norm": 0.6688504890705055, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 11328 + }, + { + "epoch": 0.11329, + "grad_norm": 0.6702221359575178, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 11329 + }, + { + "epoch": 0.1133, + "grad_norm": 0.7804814525640149, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 11330 + }, + { + "epoch": 0.11331, + "grad_norm": 0.766311055294654, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 11331 + }, + { + "epoch": 0.11332, + "grad_norm": 0.9002333596500396, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 11332 + }, + { + "epoch": 0.11333, + "grad_norm": 1.1717653891998372, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 11333 + }, + { + "epoch": 0.11334, + "grad_norm": 1.029720674784894, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 11334 + }, + { + "epoch": 0.11335, + "grad_norm": 0.9739791263307555, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 11335 + }, + { + "epoch": 0.11336, + "grad_norm": 0.9242582255769969, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 11336 + }, + { + "epoch": 0.11337, + "grad_norm": 0.7951647257658111, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 11337 + }, + { + "epoch": 0.11338, + "grad_norm": 0.7120786303604073, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 11338 + }, + { + "epoch": 0.11339, + "grad_norm": 0.712282867173393, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 11339 + }, + { + "epoch": 0.1134, + "grad_norm": 0.7880488880259838, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 11340 + }, + { + "epoch": 0.11341, + "grad_norm": 0.9025993455958837, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 11341 + }, + { + "epoch": 0.11342, + "grad_norm": 1.0478701099210872, + "learning_rate": 0.003, + "loss": 4.076, + "step": 11342 + }, + { + "epoch": 0.11343, + "grad_norm": 0.8847627483247708, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 11343 + }, + { + "epoch": 0.11344, + "grad_norm": 0.960637853491343, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 11344 + }, + { + "epoch": 0.11345, + "grad_norm": 0.9450037331480519, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 11345 + }, + { + "epoch": 0.11346, + "grad_norm": 0.8502331559297919, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 11346 + }, + { + "epoch": 0.11347, + "grad_norm": 0.8682577083722111, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 11347 + }, + { + "epoch": 0.11348, + "grad_norm": 0.659113384235199, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 11348 + }, + { + "epoch": 0.11349, + "grad_norm": 0.6504104730803049, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 11349 + }, + { + "epoch": 0.1135, + "grad_norm": 0.6787150947859348, + "learning_rate": 0.003, + "loss": 4.1053, + "step": 11350 + }, + { + "epoch": 0.11351, + "grad_norm": 0.7143052930987581, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 11351 + }, + { + "epoch": 0.11352, + "grad_norm": 0.8164438634612279, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 11352 + }, + { + "epoch": 0.11353, + "grad_norm": 1.0760369267228487, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 11353 + }, + { + "epoch": 0.11354, + "grad_norm": 1.0935038590588513, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 11354 + }, + { + "epoch": 0.11355, + "grad_norm": 0.8753571065441305, + "learning_rate": 0.003, + "loss": 4.12, + "step": 11355 + }, + { + "epoch": 0.11356, + "grad_norm": 0.8593670492792348, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 11356 + }, + { + "epoch": 0.11357, + "grad_norm": 0.8893452051885853, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 11357 + }, + { + "epoch": 0.11358, + "grad_norm": 1.0189217114351887, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 11358 + }, + { + "epoch": 0.11359, + "grad_norm": 0.9570621972206529, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 11359 + }, + { + "epoch": 0.1136, + "grad_norm": 0.8724005228517467, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 11360 + }, + { + "epoch": 0.11361, + "grad_norm": 0.9174533262178383, + "learning_rate": 0.003, + "loss": 4.064, + "step": 11361 + }, + { + "epoch": 0.11362, + "grad_norm": 0.932112927523047, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 11362 + }, + { + "epoch": 0.11363, + "grad_norm": 0.8755211146818656, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 11363 + }, + { + "epoch": 0.11364, + "grad_norm": 0.8389905241810933, + "learning_rate": 0.003, + "loss": 4.103, + "step": 11364 + }, + { + "epoch": 0.11365, + "grad_norm": 0.7005566512124908, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 11365 + }, + { + "epoch": 0.11366, + "grad_norm": 0.6535254256303773, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 11366 + }, + { + "epoch": 0.11367, + "grad_norm": 0.6972575428435476, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 11367 + }, + { + "epoch": 0.11368, + "grad_norm": 0.6107473289458821, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 11368 + }, + { + "epoch": 0.11369, + "grad_norm": 0.6124903478646556, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 11369 + }, + { + "epoch": 0.1137, + "grad_norm": 0.5656801850453923, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 11370 + }, + { + "epoch": 0.11371, + "grad_norm": 0.595358154993407, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 11371 + }, + { + "epoch": 0.11372, + "grad_norm": 0.6767828600929685, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 11372 + }, + { + "epoch": 0.11373, + "grad_norm": 0.6461746119819802, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 11373 + }, + { + "epoch": 0.11374, + "grad_norm": 0.604714571043439, + "learning_rate": 0.003, + "loss": 4.078, + "step": 11374 + }, + { + "epoch": 0.11375, + "grad_norm": 0.48164072007418185, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 11375 + }, + { + "epoch": 0.11376, + "grad_norm": 0.4519231553429714, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 11376 + }, + { + "epoch": 0.11377, + "grad_norm": 0.4901757436491158, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 11377 + }, + { + "epoch": 0.11378, + "grad_norm": 0.5499550851093622, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 11378 + }, + { + "epoch": 0.11379, + "grad_norm": 0.7378779725745853, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 11379 + }, + { + "epoch": 0.1138, + "grad_norm": 0.9744528980513482, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 11380 + }, + { + "epoch": 0.11381, + "grad_norm": 1.1621155727424501, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 11381 + }, + { + "epoch": 0.11382, + "grad_norm": 0.8383393396549385, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 11382 + }, + { + "epoch": 0.11383, + "grad_norm": 0.9902959766843705, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 11383 + }, + { + "epoch": 0.11384, + "grad_norm": 1.1649706827514636, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 11384 + }, + { + "epoch": 0.11385, + "grad_norm": 0.9401904170546886, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 11385 + }, + { + "epoch": 0.11386, + "grad_norm": 0.8918665757510584, + "learning_rate": 0.003, + "loss": 4.097, + "step": 11386 + }, + { + "epoch": 0.11387, + "grad_norm": 0.7858255897476053, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 11387 + }, + { + "epoch": 0.11388, + "grad_norm": 0.8812672645432824, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 11388 + }, + { + "epoch": 0.11389, + "grad_norm": 0.974005966509601, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 11389 + }, + { + "epoch": 0.1139, + "grad_norm": 0.9951555972112742, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 11390 + }, + { + "epoch": 0.11391, + "grad_norm": 0.7582466738628498, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 11391 + }, + { + "epoch": 0.11392, + "grad_norm": 0.7432719879616099, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 11392 + }, + { + "epoch": 0.11393, + "grad_norm": 0.8660936215295046, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 11393 + }, + { + "epoch": 0.11394, + "grad_norm": 0.8426330885658078, + "learning_rate": 0.003, + "loss": 4.057, + "step": 11394 + }, + { + "epoch": 0.11395, + "grad_norm": 0.7299468081687056, + "learning_rate": 0.003, + "loss": 4.087, + "step": 11395 + }, + { + "epoch": 0.11396, + "grad_norm": 0.8781582130259117, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 11396 + }, + { + "epoch": 0.11397, + "grad_norm": 1.0520706764406507, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 11397 + }, + { + "epoch": 0.11398, + "grad_norm": 1.1643736381328387, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 11398 + }, + { + "epoch": 0.11399, + "grad_norm": 0.8314606737771553, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 11399 + }, + { + "epoch": 0.114, + "grad_norm": 0.6798385625229799, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 11400 + }, + { + "epoch": 0.11401, + "grad_norm": 0.6336333271230933, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 11401 + }, + { + "epoch": 0.11402, + "grad_norm": 0.6573029968753343, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 11402 + }, + { + "epoch": 0.11403, + "grad_norm": 0.5896113626429579, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 11403 + }, + { + "epoch": 0.11404, + "grad_norm": 0.6243386792726651, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 11404 + }, + { + "epoch": 0.11405, + "grad_norm": 0.6821108719160749, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 11405 + }, + { + "epoch": 0.11406, + "grad_norm": 0.8538350667629351, + "learning_rate": 0.003, + "loss": 4.105, + "step": 11406 + }, + { + "epoch": 0.11407, + "grad_norm": 1.0878784230608254, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 11407 + }, + { + "epoch": 0.11408, + "grad_norm": 0.9450618160165626, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 11408 + }, + { + "epoch": 0.11409, + "grad_norm": 0.8358195561086937, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 11409 + }, + { + "epoch": 0.1141, + "grad_norm": 0.7769456605950715, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 11410 + }, + { + "epoch": 0.11411, + "grad_norm": 0.6725881254357532, + "learning_rate": 0.003, + "loss": 4.094, + "step": 11411 + }, + { + "epoch": 0.11412, + "grad_norm": 0.6317084075870764, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 11412 + }, + { + "epoch": 0.11413, + "grad_norm": 0.6892400591194415, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 11413 + }, + { + "epoch": 0.11414, + "grad_norm": 0.686994212866962, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 11414 + }, + { + "epoch": 0.11415, + "grad_norm": 0.7663353308798015, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 11415 + }, + { + "epoch": 0.11416, + "grad_norm": 0.8166295001650846, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 11416 + }, + { + "epoch": 0.11417, + "grad_norm": 0.8184869720580596, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 11417 + }, + { + "epoch": 0.11418, + "grad_norm": 0.838286743096995, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 11418 + }, + { + "epoch": 0.11419, + "grad_norm": 0.8381618543067891, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 11419 + }, + { + "epoch": 0.1142, + "grad_norm": 0.8838017739991473, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 11420 + }, + { + "epoch": 0.11421, + "grad_norm": 1.0594840809211652, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 11421 + }, + { + "epoch": 0.11422, + "grad_norm": 1.063582213055678, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 11422 + }, + { + "epoch": 0.11423, + "grad_norm": 0.969316505108481, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 11423 + }, + { + "epoch": 0.11424, + "grad_norm": 0.8624992829553142, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 11424 + }, + { + "epoch": 0.11425, + "grad_norm": 0.8210335247117174, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 11425 + }, + { + "epoch": 0.11426, + "grad_norm": 0.7752259282457125, + "learning_rate": 0.003, + "loss": 4.1094, + "step": 11426 + }, + { + "epoch": 0.11427, + "grad_norm": 0.7212696292472102, + "learning_rate": 0.003, + "loss": 4.078, + "step": 11427 + }, + { + "epoch": 0.11428, + "grad_norm": 0.7398697828017103, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 11428 + }, + { + "epoch": 0.11429, + "grad_norm": 0.838665969764261, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 11429 + }, + { + "epoch": 0.1143, + "grad_norm": 1.0609132590161845, + "learning_rate": 0.003, + "loss": 4.105, + "step": 11430 + }, + { + "epoch": 0.11431, + "grad_norm": 1.0204566381125515, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 11431 + }, + { + "epoch": 0.11432, + "grad_norm": 0.9229099605964733, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 11432 + }, + { + "epoch": 0.11433, + "grad_norm": 0.8108948768192613, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 11433 + }, + { + "epoch": 0.11434, + "grad_norm": 0.8017151237202406, + "learning_rate": 0.003, + "loss": 4.1152, + "step": 11434 + }, + { + "epoch": 0.11435, + "grad_norm": 0.8134963922170028, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 11435 + }, + { + "epoch": 0.11436, + "grad_norm": 0.8594932491854741, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 11436 + }, + { + "epoch": 0.11437, + "grad_norm": 0.8605179134609567, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 11437 + }, + { + "epoch": 0.11438, + "grad_norm": 0.7706493151867468, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 11438 + }, + { + "epoch": 0.11439, + "grad_norm": 0.7876193908634262, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 11439 + }, + { + "epoch": 0.1144, + "grad_norm": 0.7417563149358162, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 11440 + }, + { + "epoch": 0.11441, + "grad_norm": 0.6831653796726206, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 11441 + }, + { + "epoch": 0.11442, + "grad_norm": 0.716445953485537, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 11442 + }, + { + "epoch": 0.11443, + "grad_norm": 0.7845975212772534, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 11443 + }, + { + "epoch": 0.11444, + "grad_norm": 0.8859188228778797, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 11444 + }, + { + "epoch": 0.11445, + "grad_norm": 1.0662886467358412, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 11445 + }, + { + "epoch": 0.11446, + "grad_norm": 1.0791351632470838, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 11446 + }, + { + "epoch": 0.11447, + "grad_norm": 0.8063903960219335, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 11447 + }, + { + "epoch": 0.11448, + "grad_norm": 0.7612790472500267, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 11448 + }, + { + "epoch": 0.11449, + "grad_norm": 0.8204595294586179, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 11449 + }, + { + "epoch": 0.1145, + "grad_norm": 0.9567275578093394, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 11450 + }, + { + "epoch": 0.11451, + "grad_norm": 1.041839573054231, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 11451 + }, + { + "epoch": 0.11452, + "grad_norm": 1.0196914215484494, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 11452 + }, + { + "epoch": 0.11453, + "grad_norm": 1.006303329412706, + "learning_rate": 0.003, + "loss": 4.085, + "step": 11453 + }, + { + "epoch": 0.11454, + "grad_norm": 0.809225131837401, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 11454 + }, + { + "epoch": 0.11455, + "grad_norm": 0.7460555423548791, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 11455 + }, + { + "epoch": 0.11456, + "grad_norm": 0.7670282450276944, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 11456 + }, + { + "epoch": 0.11457, + "grad_norm": 0.7274292844432366, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 11457 + }, + { + "epoch": 0.11458, + "grad_norm": 0.6995337734925635, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 11458 + }, + { + "epoch": 0.11459, + "grad_norm": 0.7095202336297924, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 11459 + }, + { + "epoch": 0.1146, + "grad_norm": 0.7245500167322204, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 11460 + }, + { + "epoch": 0.11461, + "grad_norm": 0.7527717681841568, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 11461 + }, + { + "epoch": 0.11462, + "grad_norm": 0.8087152381735974, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 11462 + }, + { + "epoch": 0.11463, + "grad_norm": 0.7632330734645594, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 11463 + }, + { + "epoch": 0.11464, + "grad_norm": 0.7753557989741182, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 11464 + }, + { + "epoch": 0.11465, + "grad_norm": 0.6697990719628059, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 11465 + }, + { + "epoch": 0.11466, + "grad_norm": 0.6008005372030594, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 11466 + }, + { + "epoch": 0.11467, + "grad_norm": 0.569637208753584, + "learning_rate": 0.003, + "loss": 4.087, + "step": 11467 + }, + { + "epoch": 0.11468, + "grad_norm": 0.5517799530171552, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 11468 + }, + { + "epoch": 0.11469, + "grad_norm": 0.6074959405867231, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 11469 + }, + { + "epoch": 0.1147, + "grad_norm": 0.6098855376813679, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 11470 + }, + { + "epoch": 0.11471, + "grad_norm": 0.6204049954948289, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 11471 + }, + { + "epoch": 0.11472, + "grad_norm": 0.6263198106194131, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 11472 + }, + { + "epoch": 0.11473, + "grad_norm": 0.678838673184852, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 11473 + }, + { + "epoch": 0.11474, + "grad_norm": 0.7927653456883111, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 11474 + }, + { + "epoch": 0.11475, + "grad_norm": 0.9809518620434418, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 11475 + }, + { + "epoch": 0.11476, + "grad_norm": 1.2903461691365623, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 11476 + }, + { + "epoch": 0.11477, + "grad_norm": 0.6913112712839737, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 11477 + }, + { + "epoch": 0.11478, + "grad_norm": 0.6541500342928829, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 11478 + }, + { + "epoch": 0.11479, + "grad_norm": 0.7000564726804843, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 11479 + }, + { + "epoch": 0.1148, + "grad_norm": 0.8708818991587933, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 11480 + }, + { + "epoch": 0.11481, + "grad_norm": 1.0678161831646351, + "learning_rate": 0.003, + "loss": 4.067, + "step": 11481 + }, + { + "epoch": 0.11482, + "grad_norm": 0.9724996724592337, + "learning_rate": 0.003, + "loss": 4.1138, + "step": 11482 + }, + { + "epoch": 0.11483, + "grad_norm": 0.9324828781779055, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 11483 + }, + { + "epoch": 0.11484, + "grad_norm": 0.9503019643609745, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 11484 + }, + { + "epoch": 0.11485, + "grad_norm": 1.0400641794373122, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 11485 + }, + { + "epoch": 0.11486, + "grad_norm": 1.052286895527032, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 11486 + }, + { + "epoch": 0.11487, + "grad_norm": 1.2140198115974443, + "learning_rate": 0.003, + "loss": 4.1122, + "step": 11487 + }, + { + "epoch": 0.11488, + "grad_norm": 0.936990996016995, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 11488 + }, + { + "epoch": 0.11489, + "grad_norm": 1.026410885502423, + "learning_rate": 0.003, + "loss": 4.1034, + "step": 11489 + }, + { + "epoch": 0.1149, + "grad_norm": 0.9947215426823502, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 11490 + }, + { + "epoch": 0.11491, + "grad_norm": 0.9054827913364408, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 11491 + }, + { + "epoch": 0.11492, + "grad_norm": 0.9483496834478657, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 11492 + }, + { + "epoch": 0.11493, + "grad_norm": 0.9246213336036101, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 11493 + }, + { + "epoch": 0.11494, + "grad_norm": 0.9009007669468101, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 11494 + }, + { + "epoch": 0.11495, + "grad_norm": 0.823448429847814, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 11495 + }, + { + "epoch": 0.11496, + "grad_norm": 0.7722589344197481, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 11496 + }, + { + "epoch": 0.11497, + "grad_norm": 0.8731844529480173, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 11497 + }, + { + "epoch": 0.11498, + "grad_norm": 1.0649491375982316, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 11498 + }, + { + "epoch": 0.11499, + "grad_norm": 0.99257363481656, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 11499 + }, + { + "epoch": 0.115, + "grad_norm": 1.1228448261634618, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 11500 + }, + { + "epoch": 0.11501, + "grad_norm": 0.914708366743979, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 11501 + }, + { + "epoch": 0.11502, + "grad_norm": 0.7990894752219588, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 11502 + }, + { + "epoch": 0.11503, + "grad_norm": 0.8634859028430474, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 11503 + }, + { + "epoch": 0.11504, + "grad_norm": 0.8209522211301438, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 11504 + }, + { + "epoch": 0.11505, + "grad_norm": 0.896357133679053, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 11505 + }, + { + "epoch": 0.11506, + "grad_norm": 0.9311317378566581, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 11506 + }, + { + "epoch": 0.11507, + "grad_norm": 0.836624753600562, + "learning_rate": 0.003, + "loss": 4.111, + "step": 11507 + }, + { + "epoch": 0.11508, + "grad_norm": 0.7488419342087322, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 11508 + }, + { + "epoch": 0.11509, + "grad_norm": 0.8470336248385015, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 11509 + }, + { + "epoch": 0.1151, + "grad_norm": 0.6993378176385352, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 11510 + }, + { + "epoch": 0.11511, + "grad_norm": 0.6636735261002701, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 11511 + }, + { + "epoch": 0.11512, + "grad_norm": 0.7075834385932517, + "learning_rate": 0.003, + "loss": 4.071, + "step": 11512 + }, + { + "epoch": 0.11513, + "grad_norm": 0.8777037605850598, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 11513 + }, + { + "epoch": 0.11514, + "grad_norm": 1.0888790860680126, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 11514 + }, + { + "epoch": 0.11515, + "grad_norm": 0.7811499622133515, + "learning_rate": 0.003, + "loss": 4.119, + "step": 11515 + }, + { + "epoch": 0.11516, + "grad_norm": 0.587781962912552, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 11516 + }, + { + "epoch": 0.11517, + "grad_norm": 0.567097188926201, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 11517 + }, + { + "epoch": 0.11518, + "grad_norm": 0.5739870205232486, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 11518 + }, + { + "epoch": 0.11519, + "grad_norm": 0.5813935837226678, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 11519 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5907545971598139, + "learning_rate": 0.003, + "loss": 4.039, + "step": 11520 + }, + { + "epoch": 0.11521, + "grad_norm": 0.5751735671742095, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 11521 + }, + { + "epoch": 0.11522, + "grad_norm": 0.6059075134416301, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 11522 + }, + { + "epoch": 0.11523, + "grad_norm": 0.5832577261226649, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 11523 + }, + { + "epoch": 0.11524, + "grad_norm": 0.6607645711799944, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 11524 + }, + { + "epoch": 0.11525, + "grad_norm": 0.8334570906671758, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 11525 + }, + { + "epoch": 0.11526, + "grad_norm": 1.0909624802832556, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 11526 + }, + { + "epoch": 0.11527, + "grad_norm": 0.8976589256201832, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 11527 + }, + { + "epoch": 0.11528, + "grad_norm": 0.8247848124617317, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 11528 + }, + { + "epoch": 0.11529, + "grad_norm": 0.7003087941348926, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 11529 + }, + { + "epoch": 0.1153, + "grad_norm": 0.6668416364375418, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 11530 + }, + { + "epoch": 0.11531, + "grad_norm": 0.6635552982535685, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 11531 + }, + { + "epoch": 0.11532, + "grad_norm": 0.6895585494087922, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 11532 + }, + { + "epoch": 0.11533, + "grad_norm": 0.7200153011206304, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 11533 + }, + { + "epoch": 0.11534, + "grad_norm": 0.7002579219392758, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 11534 + }, + { + "epoch": 0.11535, + "grad_norm": 0.7115627139627968, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 11535 + }, + { + "epoch": 0.11536, + "grad_norm": 0.6951597815398586, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 11536 + }, + { + "epoch": 0.11537, + "grad_norm": 0.7358087564040365, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 11537 + }, + { + "epoch": 0.11538, + "grad_norm": 0.8336474073285849, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 11538 + }, + { + "epoch": 0.11539, + "grad_norm": 0.8213311226682373, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 11539 + }, + { + "epoch": 0.1154, + "grad_norm": 0.8566023662184081, + "learning_rate": 0.003, + "loss": 4.1, + "step": 11540 + }, + { + "epoch": 0.11541, + "grad_norm": 0.967516698557583, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 11541 + }, + { + "epoch": 0.11542, + "grad_norm": 1.1379133822032814, + "learning_rate": 0.003, + "loss": 4.1053, + "step": 11542 + }, + { + "epoch": 0.11543, + "grad_norm": 0.8990012504669521, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 11543 + }, + { + "epoch": 0.11544, + "grad_norm": 0.7155605857504481, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 11544 + }, + { + "epoch": 0.11545, + "grad_norm": 0.6319060568344189, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 11545 + }, + { + "epoch": 0.11546, + "grad_norm": 0.7034897758309688, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 11546 + }, + { + "epoch": 0.11547, + "grad_norm": 0.8461973251976806, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 11547 + }, + { + "epoch": 0.11548, + "grad_norm": 0.908594053225602, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 11548 + }, + { + "epoch": 0.11549, + "grad_norm": 0.8941198601074396, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 11549 + }, + { + "epoch": 0.1155, + "grad_norm": 1.0641433609898754, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 11550 + }, + { + "epoch": 0.11551, + "grad_norm": 1.1226709769273038, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 11551 + }, + { + "epoch": 0.11552, + "grad_norm": 0.8744132912716648, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 11552 + }, + { + "epoch": 0.11553, + "grad_norm": 0.8749775551915074, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 11553 + }, + { + "epoch": 0.11554, + "grad_norm": 1.03868006630079, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 11554 + }, + { + "epoch": 0.11555, + "grad_norm": 1.1003305815443707, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 11555 + }, + { + "epoch": 0.11556, + "grad_norm": 0.7187702623784662, + "learning_rate": 0.003, + "loss": 4.057, + "step": 11556 + }, + { + "epoch": 0.11557, + "grad_norm": 0.6407519820442772, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 11557 + }, + { + "epoch": 0.11558, + "grad_norm": 0.607065342726708, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 11558 + }, + { + "epoch": 0.11559, + "grad_norm": 0.6156699142506642, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 11559 + }, + { + "epoch": 0.1156, + "grad_norm": 0.7095856346271662, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 11560 + }, + { + "epoch": 0.11561, + "grad_norm": 0.8191987480258054, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 11561 + }, + { + "epoch": 0.11562, + "grad_norm": 0.9460987749435492, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 11562 + }, + { + "epoch": 0.11563, + "grad_norm": 1.1061407229901132, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 11563 + }, + { + "epoch": 0.11564, + "grad_norm": 0.9598519414792321, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 11564 + }, + { + "epoch": 0.11565, + "grad_norm": 1.0113948710353295, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 11565 + }, + { + "epoch": 0.11566, + "grad_norm": 0.9985959696805135, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 11566 + }, + { + "epoch": 0.11567, + "grad_norm": 0.9293996191943726, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 11567 + }, + { + "epoch": 0.11568, + "grad_norm": 0.8471543203842765, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 11568 + }, + { + "epoch": 0.11569, + "grad_norm": 0.7518687221166575, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 11569 + }, + { + "epoch": 0.1157, + "grad_norm": 0.7235992344577318, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 11570 + }, + { + "epoch": 0.11571, + "grad_norm": 0.690149421887094, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 11571 + }, + { + "epoch": 0.11572, + "grad_norm": 0.6652721769625423, + "learning_rate": 0.003, + "loss": 4.035, + "step": 11572 + }, + { + "epoch": 0.11573, + "grad_norm": 0.6896088693903043, + "learning_rate": 0.003, + "loss": 4.089, + "step": 11573 + }, + { + "epoch": 0.11574, + "grad_norm": 0.6949121480552437, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 11574 + }, + { + "epoch": 0.11575, + "grad_norm": 0.8298077286079635, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 11575 + }, + { + "epoch": 0.11576, + "grad_norm": 0.8883417883490657, + "learning_rate": 0.003, + "loss": 4.049, + "step": 11576 + }, + { + "epoch": 0.11577, + "grad_norm": 0.9051304976104619, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 11577 + }, + { + "epoch": 0.11578, + "grad_norm": 0.8752536710071942, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 11578 + }, + { + "epoch": 0.11579, + "grad_norm": 0.7653910663712051, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 11579 + }, + { + "epoch": 0.1158, + "grad_norm": 0.7465481995480454, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 11580 + }, + { + "epoch": 0.11581, + "grad_norm": 0.9199637978386034, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 11581 + }, + { + "epoch": 0.11582, + "grad_norm": 1.0783217332956068, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 11582 + }, + { + "epoch": 0.11583, + "grad_norm": 1.1107501153614088, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 11583 + }, + { + "epoch": 0.11584, + "grad_norm": 0.8180991703205399, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 11584 + }, + { + "epoch": 0.11585, + "grad_norm": 0.8100778301580475, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 11585 + }, + { + "epoch": 0.11586, + "grad_norm": 0.831553150897628, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 11586 + }, + { + "epoch": 0.11587, + "grad_norm": 0.6817930472661543, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 11587 + }, + { + "epoch": 0.11588, + "grad_norm": 0.5727527524744273, + "learning_rate": 0.003, + "loss": 4.082, + "step": 11588 + }, + { + "epoch": 0.11589, + "grad_norm": 0.6316855955481255, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 11589 + }, + { + "epoch": 0.1159, + "grad_norm": 0.6752325329450864, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 11590 + }, + { + "epoch": 0.11591, + "grad_norm": 0.7232173881070808, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 11591 + }, + { + "epoch": 0.11592, + "grad_norm": 0.7063185542230442, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 11592 + }, + { + "epoch": 0.11593, + "grad_norm": 0.7074532846256573, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 11593 + }, + { + "epoch": 0.11594, + "grad_norm": 0.7188476307054906, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 11594 + }, + { + "epoch": 0.11595, + "grad_norm": 0.7967017363971508, + "learning_rate": 0.003, + "loss": 4.096, + "step": 11595 + }, + { + "epoch": 0.11596, + "grad_norm": 0.7979659641125272, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 11596 + }, + { + "epoch": 0.11597, + "grad_norm": 0.7503354963245181, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 11597 + }, + { + "epoch": 0.11598, + "grad_norm": 0.6968784511387319, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 11598 + }, + { + "epoch": 0.11599, + "grad_norm": 0.7322551139611673, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 11599 + }, + { + "epoch": 0.116, + "grad_norm": 0.7829563677335902, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 11600 + }, + { + "epoch": 0.11601, + "grad_norm": 0.8406288218812696, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 11601 + }, + { + "epoch": 0.11602, + "grad_norm": 0.907358702251944, + "learning_rate": 0.003, + "loss": 4.051, + "step": 11602 + }, + { + "epoch": 0.11603, + "grad_norm": 1.104466090694656, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 11603 + }, + { + "epoch": 0.11604, + "grad_norm": 1.1445811229811231, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 11604 + }, + { + "epoch": 0.11605, + "grad_norm": 1.086292276099412, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 11605 + }, + { + "epoch": 0.11606, + "grad_norm": 1.0248341720469938, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 11606 + }, + { + "epoch": 0.11607, + "grad_norm": 0.8936072607044433, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 11607 + }, + { + "epoch": 0.11608, + "grad_norm": 0.6640762927725768, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 11608 + }, + { + "epoch": 0.11609, + "grad_norm": 0.5929874444616173, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 11609 + }, + { + "epoch": 0.1161, + "grad_norm": 0.7676302133161541, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 11610 + }, + { + "epoch": 0.11611, + "grad_norm": 0.9772519395033423, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 11611 + }, + { + "epoch": 0.11612, + "grad_norm": 1.1722821627389872, + "learning_rate": 0.003, + "loss": 4.1121, + "step": 11612 + }, + { + "epoch": 0.11613, + "grad_norm": 0.8317669096635092, + "learning_rate": 0.003, + "loss": 4.065, + "step": 11613 + }, + { + "epoch": 0.11614, + "grad_norm": 0.7171157192926751, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 11614 + }, + { + "epoch": 0.11615, + "grad_norm": 0.6794332467164689, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 11615 + }, + { + "epoch": 0.11616, + "grad_norm": 0.6686040048708934, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 11616 + }, + { + "epoch": 0.11617, + "grad_norm": 0.6649123255778612, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 11617 + }, + { + "epoch": 0.11618, + "grad_norm": 0.6327469112612996, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 11618 + }, + { + "epoch": 0.11619, + "grad_norm": 0.6733908474561229, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 11619 + }, + { + "epoch": 0.1162, + "grad_norm": 0.9038689078618168, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 11620 + }, + { + "epoch": 0.11621, + "grad_norm": 0.9877567335502722, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 11621 + }, + { + "epoch": 0.11622, + "grad_norm": 1.0269046635334127, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 11622 + }, + { + "epoch": 0.11623, + "grad_norm": 1.2003887721207918, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 11623 + }, + { + "epoch": 0.11624, + "grad_norm": 0.8284791223906107, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 11624 + }, + { + "epoch": 0.11625, + "grad_norm": 0.7200125040651179, + "learning_rate": 0.003, + "loss": 4.069, + "step": 11625 + }, + { + "epoch": 0.11626, + "grad_norm": 0.7398519699744737, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 11626 + }, + { + "epoch": 0.11627, + "grad_norm": 0.781547249335416, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 11627 + }, + { + "epoch": 0.11628, + "grad_norm": 0.9735491541443749, + "learning_rate": 0.003, + "loss": 4.1225, + "step": 11628 + }, + { + "epoch": 0.11629, + "grad_norm": 1.145587338551523, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 11629 + }, + { + "epoch": 0.1163, + "grad_norm": 0.8653126775605255, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 11630 + }, + { + "epoch": 0.11631, + "grad_norm": 0.7247010628569122, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 11631 + }, + { + "epoch": 0.11632, + "grad_norm": 0.6422645539925438, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 11632 + }, + { + "epoch": 0.11633, + "grad_norm": 0.6621774036007515, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 11633 + }, + { + "epoch": 0.11634, + "grad_norm": 0.79779597288848, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 11634 + }, + { + "epoch": 0.11635, + "grad_norm": 0.8893518897756485, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 11635 + }, + { + "epoch": 0.11636, + "grad_norm": 0.8572953980170954, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 11636 + }, + { + "epoch": 0.11637, + "grad_norm": 0.8296041187898088, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 11637 + }, + { + "epoch": 0.11638, + "grad_norm": 0.8157149630498728, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 11638 + }, + { + "epoch": 0.11639, + "grad_norm": 0.8485733672985765, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 11639 + }, + { + "epoch": 0.1164, + "grad_norm": 0.8444919120692772, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 11640 + }, + { + "epoch": 0.11641, + "grad_norm": 0.9167579257168221, + "learning_rate": 0.003, + "loss": 4.09, + "step": 11641 + }, + { + "epoch": 0.11642, + "grad_norm": 1.0334298143941645, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 11642 + }, + { + "epoch": 0.11643, + "grad_norm": 0.9996522485793898, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 11643 + }, + { + "epoch": 0.11644, + "grad_norm": 0.9191598889332477, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 11644 + }, + { + "epoch": 0.11645, + "grad_norm": 0.786355606103955, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 11645 + }, + { + "epoch": 0.11646, + "grad_norm": 0.7874834113515161, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 11646 + }, + { + "epoch": 0.11647, + "grad_norm": 0.8462857762931152, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 11647 + }, + { + "epoch": 0.11648, + "grad_norm": 0.7219922126008644, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 11648 + }, + { + "epoch": 0.11649, + "grad_norm": 0.7870218836101154, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 11649 + }, + { + "epoch": 0.1165, + "grad_norm": 0.7831053281586716, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 11650 + }, + { + "epoch": 0.11651, + "grad_norm": 0.8416193416227049, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 11651 + }, + { + "epoch": 0.11652, + "grad_norm": 0.9220794419288686, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 11652 + }, + { + "epoch": 0.11653, + "grad_norm": 0.927703344770682, + "learning_rate": 0.003, + "loss": 4.093, + "step": 11653 + }, + { + "epoch": 0.11654, + "grad_norm": 0.9093528793704585, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 11654 + }, + { + "epoch": 0.11655, + "grad_norm": 0.76695683270814, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 11655 + }, + { + "epoch": 0.11656, + "grad_norm": 0.7569008148443639, + "learning_rate": 0.003, + "loss": 4.075, + "step": 11656 + }, + { + "epoch": 0.11657, + "grad_norm": 0.7569148698989775, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 11657 + }, + { + "epoch": 0.11658, + "grad_norm": 0.8316401753149308, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 11658 + }, + { + "epoch": 0.11659, + "grad_norm": 0.855298459137231, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 11659 + }, + { + "epoch": 0.1166, + "grad_norm": 0.7696793683113001, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 11660 + }, + { + "epoch": 0.11661, + "grad_norm": 0.8529284941515057, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 11661 + }, + { + "epoch": 0.11662, + "grad_norm": 0.7150325438414864, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 11662 + }, + { + "epoch": 0.11663, + "grad_norm": 0.6516355774146884, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 11663 + }, + { + "epoch": 0.11664, + "grad_norm": 0.6410165413648975, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 11664 + }, + { + "epoch": 0.11665, + "grad_norm": 0.6643487625692873, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 11665 + }, + { + "epoch": 0.11666, + "grad_norm": 0.7324064210364144, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 11666 + }, + { + "epoch": 0.11667, + "grad_norm": 0.9646425504933539, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 11667 + }, + { + "epoch": 0.11668, + "grad_norm": 1.3721880860905278, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 11668 + }, + { + "epoch": 0.11669, + "grad_norm": 0.6659074294409868, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 11669 + }, + { + "epoch": 0.1167, + "grad_norm": 0.7318779336296473, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 11670 + }, + { + "epoch": 0.11671, + "grad_norm": 0.7227667586352463, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 11671 + }, + { + "epoch": 0.11672, + "grad_norm": 0.7121967652453894, + "learning_rate": 0.003, + "loss": 4.108, + "step": 11672 + }, + { + "epoch": 0.11673, + "grad_norm": 0.7810832836036794, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 11673 + }, + { + "epoch": 0.11674, + "grad_norm": 0.8147932367854734, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 11674 + }, + { + "epoch": 0.11675, + "grad_norm": 0.8221780637712801, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 11675 + }, + { + "epoch": 0.11676, + "grad_norm": 0.8322387673293494, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 11676 + }, + { + "epoch": 0.11677, + "grad_norm": 1.049561091040258, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 11677 + }, + { + "epoch": 0.11678, + "grad_norm": 1.1285293686759006, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 11678 + }, + { + "epoch": 0.11679, + "grad_norm": 0.7942929439071569, + "learning_rate": 0.003, + "loss": 4.076, + "step": 11679 + }, + { + "epoch": 0.1168, + "grad_norm": 0.8121823955224093, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 11680 + }, + { + "epoch": 0.11681, + "grad_norm": 0.886985617007665, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 11681 + }, + { + "epoch": 0.11682, + "grad_norm": 0.9835906722387846, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 11682 + }, + { + "epoch": 0.11683, + "grad_norm": 0.9742855718416906, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 11683 + }, + { + "epoch": 0.11684, + "grad_norm": 0.9403544667282324, + "learning_rate": 0.003, + "loss": 4.088, + "step": 11684 + }, + { + "epoch": 0.11685, + "grad_norm": 0.8843233261450154, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 11685 + }, + { + "epoch": 0.11686, + "grad_norm": 0.9595432183246029, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 11686 + }, + { + "epoch": 0.11687, + "grad_norm": 0.9177948190718449, + "learning_rate": 0.003, + "loss": 4.101, + "step": 11687 + }, + { + "epoch": 0.11688, + "grad_norm": 0.7568265274966034, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 11688 + }, + { + "epoch": 0.11689, + "grad_norm": 0.6724191688083208, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 11689 + }, + { + "epoch": 0.1169, + "grad_norm": 0.5319571543918213, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 11690 + }, + { + "epoch": 0.11691, + "grad_norm": 0.6091559698255019, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 11691 + }, + { + "epoch": 0.11692, + "grad_norm": 0.5314136756707564, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 11692 + }, + { + "epoch": 0.11693, + "grad_norm": 0.6276209351566823, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 11693 + }, + { + "epoch": 0.11694, + "grad_norm": 0.5809151184858754, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 11694 + }, + { + "epoch": 0.11695, + "grad_norm": 0.5582511698901078, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 11695 + }, + { + "epoch": 0.11696, + "grad_norm": 0.625511431306643, + "learning_rate": 0.003, + "loss": 4.078, + "step": 11696 + }, + { + "epoch": 0.11697, + "grad_norm": 0.7422327835880065, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 11697 + }, + { + "epoch": 0.11698, + "grad_norm": 0.8382422647197152, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 11698 + }, + { + "epoch": 0.11699, + "grad_norm": 0.9000283347934224, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 11699 + }, + { + "epoch": 0.117, + "grad_norm": 1.0768928598652463, + "learning_rate": 0.003, + "loss": 4.081, + "step": 11700 + }, + { + "epoch": 0.11701, + "grad_norm": 1.0652656683545332, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 11701 + }, + { + "epoch": 0.11702, + "grad_norm": 1.057544811604008, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 11702 + }, + { + "epoch": 0.11703, + "grad_norm": 0.9043756476621583, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 11703 + }, + { + "epoch": 0.11704, + "grad_norm": 1.0027396176106307, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 11704 + }, + { + "epoch": 0.11705, + "grad_norm": 0.9744740701229088, + "learning_rate": 0.003, + "loss": 4.076, + "step": 11705 + }, + { + "epoch": 0.11706, + "grad_norm": 0.9664960636352933, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 11706 + }, + { + "epoch": 0.11707, + "grad_norm": 0.8875027587459325, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 11707 + }, + { + "epoch": 0.11708, + "grad_norm": 0.9513123997990248, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 11708 + }, + { + "epoch": 0.11709, + "grad_norm": 1.1466303315616597, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 11709 + }, + { + "epoch": 0.1171, + "grad_norm": 0.7135812377242298, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 11710 + }, + { + "epoch": 0.11711, + "grad_norm": 0.7452097554644643, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 11711 + }, + { + "epoch": 0.11712, + "grad_norm": 0.8389929029307317, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 11712 + }, + { + "epoch": 0.11713, + "grad_norm": 0.987386349524161, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 11713 + }, + { + "epoch": 0.11714, + "grad_norm": 0.9815075689891968, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 11714 + }, + { + "epoch": 0.11715, + "grad_norm": 0.9108941418492904, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 11715 + }, + { + "epoch": 0.11716, + "grad_norm": 0.8388527095547572, + "learning_rate": 0.003, + "loss": 4.056, + "step": 11716 + }, + { + "epoch": 0.11717, + "grad_norm": 0.6655470826961603, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 11717 + }, + { + "epoch": 0.11718, + "grad_norm": 0.6149356984381353, + "learning_rate": 0.003, + "loss": 4.082, + "step": 11718 + }, + { + "epoch": 0.11719, + "grad_norm": 0.6553899157977472, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 11719 + }, + { + "epoch": 0.1172, + "grad_norm": 0.6599660519536159, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 11720 + }, + { + "epoch": 0.11721, + "grad_norm": 0.6891664979129074, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 11721 + }, + { + "epoch": 0.11722, + "grad_norm": 0.7551605902492077, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 11722 + }, + { + "epoch": 0.11723, + "grad_norm": 0.9437203521693749, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 11723 + }, + { + "epoch": 0.11724, + "grad_norm": 1.063671318367107, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 11724 + }, + { + "epoch": 0.11725, + "grad_norm": 0.885530718983848, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 11725 + }, + { + "epoch": 0.11726, + "grad_norm": 0.7116632977684936, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 11726 + }, + { + "epoch": 0.11727, + "grad_norm": 0.6188393045829603, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 11727 + }, + { + "epoch": 0.11728, + "grad_norm": 0.6111188456564599, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 11728 + }, + { + "epoch": 0.11729, + "grad_norm": 0.6625273579626278, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 11729 + }, + { + "epoch": 0.1173, + "grad_norm": 0.7720788705683885, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 11730 + }, + { + "epoch": 0.11731, + "grad_norm": 0.9295753621861833, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 11731 + }, + { + "epoch": 0.11732, + "grad_norm": 1.117967364464593, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 11732 + }, + { + "epoch": 0.11733, + "grad_norm": 0.7599278186135924, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 11733 + }, + { + "epoch": 0.11734, + "grad_norm": 0.6147265724899618, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 11734 + }, + { + "epoch": 0.11735, + "grad_norm": 0.6725320357867157, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 11735 + }, + { + "epoch": 0.11736, + "grad_norm": 0.7456835884216022, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 11736 + }, + { + "epoch": 0.11737, + "grad_norm": 0.812797363085294, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 11737 + }, + { + "epoch": 0.11738, + "grad_norm": 0.8129215074388013, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 11738 + }, + { + "epoch": 0.11739, + "grad_norm": 0.9121844049181134, + "learning_rate": 0.003, + "loss": 4.087, + "step": 11739 + }, + { + "epoch": 0.1174, + "grad_norm": 0.9265831704683285, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 11740 + }, + { + "epoch": 0.11741, + "grad_norm": 0.9117372442258332, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 11741 + }, + { + "epoch": 0.11742, + "grad_norm": 0.8776990703973094, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 11742 + }, + { + "epoch": 0.11743, + "grad_norm": 0.9618020801881524, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 11743 + }, + { + "epoch": 0.11744, + "grad_norm": 0.9316312393728313, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 11744 + }, + { + "epoch": 0.11745, + "grad_norm": 1.031508767645295, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 11745 + }, + { + "epoch": 0.11746, + "grad_norm": 1.0480620764497686, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 11746 + }, + { + "epoch": 0.11747, + "grad_norm": 0.8812457935053363, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 11747 + }, + { + "epoch": 0.11748, + "grad_norm": 0.8446087681633835, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 11748 + }, + { + "epoch": 0.11749, + "grad_norm": 0.8806053106912328, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 11749 + }, + { + "epoch": 0.1175, + "grad_norm": 1.0652412401055025, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 11750 + }, + { + "epoch": 0.11751, + "grad_norm": 1.0154102072008697, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 11751 + }, + { + "epoch": 0.11752, + "grad_norm": 0.8782679887197985, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 11752 + }, + { + "epoch": 0.11753, + "grad_norm": 0.8768908843163372, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 11753 + }, + { + "epoch": 0.11754, + "grad_norm": 0.955884951313414, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 11754 + }, + { + "epoch": 0.11755, + "grad_norm": 0.9232099819063957, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 11755 + }, + { + "epoch": 0.11756, + "grad_norm": 0.8791836223086985, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 11756 + }, + { + "epoch": 0.11757, + "grad_norm": 1.0426667558176133, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 11757 + }, + { + "epoch": 0.11758, + "grad_norm": 1.1146348535077464, + "learning_rate": 0.003, + "loss": 4.1228, + "step": 11758 + }, + { + "epoch": 0.11759, + "grad_norm": 0.7774305248469459, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 11759 + }, + { + "epoch": 0.1176, + "grad_norm": 0.725058397672122, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 11760 + }, + { + "epoch": 0.11761, + "grad_norm": 0.7524803504936709, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 11761 + }, + { + "epoch": 0.11762, + "grad_norm": 0.6014674016359951, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 11762 + }, + { + "epoch": 0.11763, + "grad_norm": 0.6911615119202653, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 11763 + }, + { + "epoch": 0.11764, + "grad_norm": 0.9141987084938712, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 11764 + }, + { + "epoch": 0.11765, + "grad_norm": 1.0528972268151409, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 11765 + }, + { + "epoch": 0.11766, + "grad_norm": 1.0784419150121876, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 11766 + }, + { + "epoch": 0.11767, + "grad_norm": 0.8858386866249308, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 11767 + }, + { + "epoch": 0.11768, + "grad_norm": 0.8405360479720302, + "learning_rate": 0.003, + "loss": 4.084, + "step": 11768 + }, + { + "epoch": 0.11769, + "grad_norm": 0.686081864034834, + "learning_rate": 0.003, + "loss": 4.082, + "step": 11769 + }, + { + "epoch": 0.1177, + "grad_norm": 0.5832102646409766, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 11770 + }, + { + "epoch": 0.11771, + "grad_norm": 0.5616671018869388, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 11771 + }, + { + "epoch": 0.11772, + "grad_norm": 0.5590360587542899, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 11772 + }, + { + "epoch": 0.11773, + "grad_norm": 0.5679055088444243, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 11773 + }, + { + "epoch": 0.11774, + "grad_norm": 0.4680771136119359, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 11774 + }, + { + "epoch": 0.11775, + "grad_norm": 0.5092729926954808, + "learning_rate": 0.003, + "loss": 4.067, + "step": 11775 + }, + { + "epoch": 0.11776, + "grad_norm": 0.5424922004773088, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 11776 + }, + { + "epoch": 0.11777, + "grad_norm": 0.5936151804472162, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 11777 + }, + { + "epoch": 0.11778, + "grad_norm": 0.6732649637257471, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 11778 + }, + { + "epoch": 0.11779, + "grad_norm": 0.7690416533056713, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 11779 + }, + { + "epoch": 0.1178, + "grad_norm": 0.8353140580453683, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 11780 + }, + { + "epoch": 0.11781, + "grad_norm": 0.8935837715211247, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 11781 + }, + { + "epoch": 0.11782, + "grad_norm": 1.0531796991636666, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 11782 + }, + { + "epoch": 0.11783, + "grad_norm": 1.079224226296788, + "learning_rate": 0.003, + "loss": 4.1119, + "step": 11783 + }, + { + "epoch": 0.11784, + "grad_norm": 0.8356900460632927, + "learning_rate": 0.003, + "loss": 4.068, + "step": 11784 + }, + { + "epoch": 0.11785, + "grad_norm": 0.8084761278959023, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 11785 + }, + { + "epoch": 0.11786, + "grad_norm": 0.82914704043006, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 11786 + }, + { + "epoch": 0.11787, + "grad_norm": 0.6775644295176407, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 11787 + }, + { + "epoch": 0.11788, + "grad_norm": 0.6655348429091607, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 11788 + }, + { + "epoch": 0.11789, + "grad_norm": 0.6407300835634582, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 11789 + }, + { + "epoch": 0.1179, + "grad_norm": 0.7638731149124559, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 11790 + }, + { + "epoch": 0.11791, + "grad_norm": 1.0282178932707131, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 11791 + }, + { + "epoch": 0.11792, + "grad_norm": 1.264008956440248, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 11792 + }, + { + "epoch": 0.11793, + "grad_norm": 0.6519522242962901, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 11793 + }, + { + "epoch": 0.11794, + "grad_norm": 0.5935865323698845, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 11794 + }, + { + "epoch": 0.11795, + "grad_norm": 0.6666966012390865, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 11795 + }, + { + "epoch": 0.11796, + "grad_norm": 0.8503744688629382, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 11796 + }, + { + "epoch": 0.11797, + "grad_norm": 1.0816103202425806, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 11797 + }, + { + "epoch": 0.11798, + "grad_norm": 0.8486292640258042, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 11798 + }, + { + "epoch": 0.11799, + "grad_norm": 0.774259880796334, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 11799 + }, + { + "epoch": 0.118, + "grad_norm": 0.6827970488744037, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 11800 + }, + { + "epoch": 0.11801, + "grad_norm": 0.680022168428237, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 11801 + }, + { + "epoch": 0.11802, + "grad_norm": 0.7608902817432555, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 11802 + }, + { + "epoch": 0.11803, + "grad_norm": 0.8771018181150345, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 11803 + }, + { + "epoch": 0.11804, + "grad_norm": 1.0895264488416811, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 11804 + }, + { + "epoch": 0.11805, + "grad_norm": 1.1713998207842216, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 11805 + }, + { + "epoch": 0.11806, + "grad_norm": 0.9559247722625149, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 11806 + }, + { + "epoch": 0.11807, + "grad_norm": 1.0176177339184564, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 11807 + }, + { + "epoch": 0.11808, + "grad_norm": 0.9828894080361346, + "learning_rate": 0.003, + "loss": 4.071, + "step": 11808 + }, + { + "epoch": 0.11809, + "grad_norm": 0.9078709897541808, + "learning_rate": 0.003, + "loss": 4.101, + "step": 11809 + }, + { + "epoch": 0.1181, + "grad_norm": 0.7298296561198457, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 11810 + }, + { + "epoch": 0.11811, + "grad_norm": 0.7136721860676385, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 11811 + }, + { + "epoch": 0.11812, + "grad_norm": 0.7120421825821814, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 11812 + }, + { + "epoch": 0.11813, + "grad_norm": 0.7436975761264889, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 11813 + }, + { + "epoch": 0.11814, + "grad_norm": 0.6765075476982464, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 11814 + }, + { + "epoch": 0.11815, + "grad_norm": 0.7008626110877016, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 11815 + }, + { + "epoch": 0.11816, + "grad_norm": 0.7690086118905748, + "learning_rate": 0.003, + "loss": 4.058, + "step": 11816 + }, + { + "epoch": 0.11817, + "grad_norm": 0.8286856888738373, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 11817 + }, + { + "epoch": 0.11818, + "grad_norm": 0.833350887245866, + "learning_rate": 0.003, + "loss": 4.082, + "step": 11818 + }, + { + "epoch": 0.11819, + "grad_norm": 0.8133365071024339, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 11819 + }, + { + "epoch": 0.1182, + "grad_norm": 0.8950574441093196, + "learning_rate": 0.003, + "loss": 4.09, + "step": 11820 + }, + { + "epoch": 0.11821, + "grad_norm": 0.8314993561563281, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 11821 + }, + { + "epoch": 0.11822, + "grad_norm": 0.879384195427867, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 11822 + }, + { + "epoch": 0.11823, + "grad_norm": 0.8712791151137181, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 11823 + }, + { + "epoch": 0.11824, + "grad_norm": 0.7946335403095336, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 11824 + }, + { + "epoch": 0.11825, + "grad_norm": 0.9251946099724122, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 11825 + }, + { + "epoch": 0.11826, + "grad_norm": 0.9089433872644047, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 11826 + }, + { + "epoch": 0.11827, + "grad_norm": 0.960148991783686, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 11827 + }, + { + "epoch": 0.11828, + "grad_norm": 0.9553461678044473, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 11828 + }, + { + "epoch": 0.11829, + "grad_norm": 0.793056431276092, + "learning_rate": 0.003, + "loss": 4.106, + "step": 11829 + }, + { + "epoch": 0.1183, + "grad_norm": 0.7258879341191068, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 11830 + }, + { + "epoch": 0.11831, + "grad_norm": 0.7398380601017597, + "learning_rate": 0.003, + "loss": 4.086, + "step": 11831 + }, + { + "epoch": 0.11832, + "grad_norm": 0.6754318588653985, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 11832 + }, + { + "epoch": 0.11833, + "grad_norm": 0.6170706589009188, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 11833 + }, + { + "epoch": 0.11834, + "grad_norm": 0.646106463233276, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 11834 + }, + { + "epoch": 0.11835, + "grad_norm": 0.8762103450992815, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 11835 + }, + { + "epoch": 0.11836, + "grad_norm": 1.2776762081665294, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 11836 + }, + { + "epoch": 0.11837, + "grad_norm": 0.8108701163884705, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 11837 + }, + { + "epoch": 0.11838, + "grad_norm": 0.6940338943383062, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 11838 + }, + { + "epoch": 0.11839, + "grad_norm": 0.7901864807977117, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 11839 + }, + { + "epoch": 0.1184, + "grad_norm": 0.7807961097446459, + "learning_rate": 0.003, + "loss": 4.07, + "step": 11840 + }, + { + "epoch": 0.11841, + "grad_norm": 0.9471463996011317, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 11841 + }, + { + "epoch": 0.11842, + "grad_norm": 0.8478233584393488, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 11842 + }, + { + "epoch": 0.11843, + "grad_norm": 0.763512806199942, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 11843 + }, + { + "epoch": 0.11844, + "grad_norm": 0.6452078727657857, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 11844 + }, + { + "epoch": 0.11845, + "grad_norm": 0.6445995030373571, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 11845 + }, + { + "epoch": 0.11846, + "grad_norm": 0.742374167214842, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 11846 + }, + { + "epoch": 0.11847, + "grad_norm": 0.7351815640255105, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 11847 + }, + { + "epoch": 0.11848, + "grad_norm": 0.8196365500471058, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 11848 + }, + { + "epoch": 0.11849, + "grad_norm": 0.831588584012136, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 11849 + }, + { + "epoch": 0.1185, + "grad_norm": 0.8433679354730954, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 11850 + }, + { + "epoch": 0.11851, + "grad_norm": 0.9137556915989256, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 11851 + }, + { + "epoch": 0.11852, + "grad_norm": 1.0697839356450594, + "learning_rate": 0.003, + "loss": 4.077, + "step": 11852 + }, + { + "epoch": 0.11853, + "grad_norm": 1.049866632695257, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 11853 + }, + { + "epoch": 0.11854, + "grad_norm": 0.8671901419952202, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 11854 + }, + { + "epoch": 0.11855, + "grad_norm": 0.9208510989321324, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 11855 + }, + { + "epoch": 0.11856, + "grad_norm": 0.8926572862378708, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 11856 + }, + { + "epoch": 0.11857, + "grad_norm": 0.8433740093332531, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 11857 + }, + { + "epoch": 0.11858, + "grad_norm": 0.8254144915988062, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 11858 + }, + { + "epoch": 0.11859, + "grad_norm": 0.7926083472119856, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 11859 + }, + { + "epoch": 0.1186, + "grad_norm": 0.794748225166051, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 11860 + }, + { + "epoch": 0.11861, + "grad_norm": 0.8684563379643798, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 11861 + }, + { + "epoch": 0.11862, + "grad_norm": 0.8851848538484391, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 11862 + }, + { + "epoch": 0.11863, + "grad_norm": 0.73032091671914, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 11863 + }, + { + "epoch": 0.11864, + "grad_norm": 0.6668427993843925, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 11864 + }, + { + "epoch": 0.11865, + "grad_norm": 0.6165145527182839, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 11865 + }, + { + "epoch": 0.11866, + "grad_norm": 0.612055241418331, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 11866 + }, + { + "epoch": 0.11867, + "grad_norm": 0.7059845049393006, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 11867 + }, + { + "epoch": 0.11868, + "grad_norm": 0.7560768778001865, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 11868 + }, + { + "epoch": 0.11869, + "grad_norm": 0.8673186931348383, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 11869 + }, + { + "epoch": 0.1187, + "grad_norm": 1.010879366295258, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 11870 + }, + { + "epoch": 0.11871, + "grad_norm": 1.0209959618047066, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 11871 + }, + { + "epoch": 0.11872, + "grad_norm": 0.9860088323911688, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 11872 + }, + { + "epoch": 0.11873, + "grad_norm": 1.1659158688060458, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 11873 + }, + { + "epoch": 0.11874, + "grad_norm": 0.8094261387221169, + "learning_rate": 0.003, + "loss": 4.042, + "step": 11874 + }, + { + "epoch": 0.11875, + "grad_norm": 0.6335448424270009, + "learning_rate": 0.003, + "loss": 4.08, + "step": 11875 + }, + { + "epoch": 0.11876, + "grad_norm": 0.6773731825309582, + "learning_rate": 0.003, + "loss": 4.052, + "step": 11876 + }, + { + "epoch": 0.11877, + "grad_norm": 0.6984872535116806, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 11877 + }, + { + "epoch": 0.11878, + "grad_norm": 0.7251659690338177, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 11878 + }, + { + "epoch": 0.11879, + "grad_norm": 0.7588124610807487, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 11879 + }, + { + "epoch": 0.1188, + "grad_norm": 0.9426291501868473, + "learning_rate": 0.003, + "loss": 4.084, + "step": 11880 + }, + { + "epoch": 0.11881, + "grad_norm": 0.8549154743151803, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 11881 + }, + { + "epoch": 0.11882, + "grad_norm": 0.7486076080102272, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 11882 + }, + { + "epoch": 0.11883, + "grad_norm": 0.7005995077416897, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 11883 + }, + { + "epoch": 0.11884, + "grad_norm": 0.6614383598281759, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 11884 + }, + { + "epoch": 0.11885, + "grad_norm": 0.7390909664238483, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 11885 + }, + { + "epoch": 0.11886, + "grad_norm": 0.8006965897045121, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 11886 + }, + { + "epoch": 0.11887, + "grad_norm": 0.7146342820208608, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 11887 + }, + { + "epoch": 0.11888, + "grad_norm": 0.6767998942626821, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 11888 + }, + { + "epoch": 0.11889, + "grad_norm": 0.7800125822291902, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 11889 + }, + { + "epoch": 0.1189, + "grad_norm": 0.8777806739707887, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 11890 + }, + { + "epoch": 0.11891, + "grad_norm": 0.9746936424903314, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 11891 + }, + { + "epoch": 0.11892, + "grad_norm": 1.1884031352290723, + "learning_rate": 0.003, + "loss": 4.09, + "step": 11892 + }, + { + "epoch": 0.11893, + "grad_norm": 1.13475999737239, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 11893 + }, + { + "epoch": 0.11894, + "grad_norm": 0.8967699394773493, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 11894 + }, + { + "epoch": 0.11895, + "grad_norm": 0.8007111264651628, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 11895 + }, + { + "epoch": 0.11896, + "grad_norm": 0.7307540515461585, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 11896 + }, + { + "epoch": 0.11897, + "grad_norm": 0.7815929432354917, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 11897 + }, + { + "epoch": 0.11898, + "grad_norm": 0.7385538729204432, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 11898 + }, + { + "epoch": 0.11899, + "grad_norm": 0.6650532955074363, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 11899 + }, + { + "epoch": 0.119, + "grad_norm": 0.6876031187179125, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 11900 + }, + { + "epoch": 0.11901, + "grad_norm": 0.6693901471853071, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 11901 + }, + { + "epoch": 0.11902, + "grad_norm": 0.7450683537894556, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 11902 + }, + { + "epoch": 0.11903, + "grad_norm": 0.9583378538144384, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 11903 + }, + { + "epoch": 0.11904, + "grad_norm": 1.2031401004775466, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 11904 + }, + { + "epoch": 0.11905, + "grad_norm": 0.6708910129080695, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 11905 + }, + { + "epoch": 0.11906, + "grad_norm": 0.6088762788761715, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 11906 + }, + { + "epoch": 0.11907, + "grad_norm": 0.6657607666816874, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 11907 + }, + { + "epoch": 0.11908, + "grad_norm": 0.7841564673513367, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 11908 + }, + { + "epoch": 0.11909, + "grad_norm": 0.9021268502553939, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 11909 + }, + { + "epoch": 0.1191, + "grad_norm": 0.916707108404376, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 11910 + }, + { + "epoch": 0.11911, + "grad_norm": 0.8246149905091653, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 11911 + }, + { + "epoch": 0.11912, + "grad_norm": 0.7323629401540885, + "learning_rate": 0.003, + "loss": 4.1161, + "step": 11912 + }, + { + "epoch": 0.11913, + "grad_norm": 0.9061360685992332, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 11913 + }, + { + "epoch": 0.11914, + "grad_norm": 1.0343836872001413, + "learning_rate": 0.003, + "loss": 4.111, + "step": 11914 + }, + { + "epoch": 0.11915, + "grad_norm": 0.997844312316651, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 11915 + }, + { + "epoch": 0.11916, + "grad_norm": 1.1217834060028307, + "learning_rate": 0.003, + "loss": 4.116, + "step": 11916 + }, + { + "epoch": 0.11917, + "grad_norm": 0.9155676472690304, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 11917 + }, + { + "epoch": 0.11918, + "grad_norm": 0.8305774438930578, + "learning_rate": 0.003, + "loss": 4.091, + "step": 11918 + }, + { + "epoch": 0.11919, + "grad_norm": 0.692585947841226, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 11919 + }, + { + "epoch": 0.1192, + "grad_norm": 0.615359817362125, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 11920 + }, + { + "epoch": 0.11921, + "grad_norm": 0.6550229673560121, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 11921 + }, + { + "epoch": 0.11922, + "grad_norm": 0.7481644276266792, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 11922 + }, + { + "epoch": 0.11923, + "grad_norm": 0.9216830062652961, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 11923 + }, + { + "epoch": 0.11924, + "grad_norm": 1.148633737473478, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 11924 + }, + { + "epoch": 0.11925, + "grad_norm": 0.88659831115291, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 11925 + }, + { + "epoch": 0.11926, + "grad_norm": 0.8069585267153818, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 11926 + }, + { + "epoch": 0.11927, + "grad_norm": 0.7347630623585638, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 11927 + }, + { + "epoch": 0.11928, + "grad_norm": 0.7050032718999796, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 11928 + }, + { + "epoch": 0.11929, + "grad_norm": 0.7815451331357121, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 11929 + }, + { + "epoch": 0.1193, + "grad_norm": 0.6849344807101223, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 11930 + }, + { + "epoch": 0.11931, + "grad_norm": 0.679361133468356, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 11931 + }, + { + "epoch": 0.11932, + "grad_norm": 0.8450698013980862, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 11932 + }, + { + "epoch": 0.11933, + "grad_norm": 0.8835274823568191, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 11933 + }, + { + "epoch": 0.11934, + "grad_norm": 0.8690045370066336, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 11934 + }, + { + "epoch": 0.11935, + "grad_norm": 1.0116570673577854, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 11935 + }, + { + "epoch": 0.11936, + "grad_norm": 1.0767742621445415, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 11936 + }, + { + "epoch": 0.11937, + "grad_norm": 0.9916078684299126, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 11937 + }, + { + "epoch": 0.11938, + "grad_norm": 0.9222300875660133, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 11938 + }, + { + "epoch": 0.11939, + "grad_norm": 0.8135030956349809, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 11939 + }, + { + "epoch": 0.1194, + "grad_norm": 0.8370228867878781, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 11940 + }, + { + "epoch": 0.11941, + "grad_norm": 0.8571547503694238, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 11941 + }, + { + "epoch": 0.11942, + "grad_norm": 0.805516408836029, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 11942 + }, + { + "epoch": 0.11943, + "grad_norm": 0.8193503171831941, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 11943 + }, + { + "epoch": 0.11944, + "grad_norm": 0.8006684034359407, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 11944 + }, + { + "epoch": 0.11945, + "grad_norm": 0.76548572031874, + "learning_rate": 0.003, + "loss": 4.052, + "step": 11945 + }, + { + "epoch": 0.11946, + "grad_norm": 0.8012017184250187, + "learning_rate": 0.003, + "loss": 4.067, + "step": 11946 + }, + { + "epoch": 0.11947, + "grad_norm": 0.7268659883443598, + "learning_rate": 0.003, + "loss": 4.095, + "step": 11947 + }, + { + "epoch": 0.11948, + "grad_norm": 0.748838960756178, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 11948 + }, + { + "epoch": 0.11949, + "grad_norm": 0.859028005033597, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 11949 + }, + { + "epoch": 0.1195, + "grad_norm": 0.9373645772108132, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 11950 + }, + { + "epoch": 0.11951, + "grad_norm": 1.2337400168375312, + "learning_rate": 0.003, + "loss": 4.095, + "step": 11951 + }, + { + "epoch": 0.11952, + "grad_norm": 0.9365772417221291, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 11952 + }, + { + "epoch": 0.11953, + "grad_norm": 0.8691607276398206, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 11953 + }, + { + "epoch": 0.11954, + "grad_norm": 0.8045361104724315, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 11954 + }, + { + "epoch": 0.11955, + "grad_norm": 0.822560519294801, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 11955 + }, + { + "epoch": 0.11956, + "grad_norm": 0.9071708859836859, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 11956 + }, + { + "epoch": 0.11957, + "grad_norm": 1.0317809037825372, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 11957 + }, + { + "epoch": 0.11958, + "grad_norm": 1.0457569164754317, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 11958 + }, + { + "epoch": 0.11959, + "grad_norm": 0.9491594661548564, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 11959 + }, + { + "epoch": 0.1196, + "grad_norm": 0.8583938821808423, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 11960 + }, + { + "epoch": 0.11961, + "grad_norm": 0.7415940680604313, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 11961 + }, + { + "epoch": 0.11962, + "grad_norm": 0.6353780517582217, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 11962 + }, + { + "epoch": 0.11963, + "grad_norm": 0.7983793537208222, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 11963 + }, + { + "epoch": 0.11964, + "grad_norm": 0.7886334581246542, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 11964 + }, + { + "epoch": 0.11965, + "grad_norm": 0.718014453801322, + "learning_rate": 0.003, + "loss": 4.086, + "step": 11965 + }, + { + "epoch": 0.11966, + "grad_norm": 0.7093182418337178, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 11966 + }, + { + "epoch": 0.11967, + "grad_norm": 0.6216164077811807, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 11967 + }, + { + "epoch": 0.11968, + "grad_norm": 0.5674808291344278, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 11968 + }, + { + "epoch": 0.11969, + "grad_norm": 0.5220969467595893, + "learning_rate": 0.003, + "loss": 4.087, + "step": 11969 + }, + { + "epoch": 0.1197, + "grad_norm": 0.5201727120164347, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 11970 + }, + { + "epoch": 0.11971, + "grad_norm": 0.45658338264133513, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 11971 + }, + { + "epoch": 0.11972, + "grad_norm": 0.5541459167779041, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 11972 + }, + { + "epoch": 0.11973, + "grad_norm": 0.6364115851727812, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 11973 + }, + { + "epoch": 0.11974, + "grad_norm": 0.7923773553003368, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 11974 + }, + { + "epoch": 0.11975, + "grad_norm": 1.119949647941313, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 11975 + }, + { + "epoch": 0.11976, + "grad_norm": 1.23207415112035, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 11976 + }, + { + "epoch": 0.11977, + "grad_norm": 0.6590221214615162, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 11977 + }, + { + "epoch": 0.11978, + "grad_norm": 0.6948614605624547, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 11978 + }, + { + "epoch": 0.11979, + "grad_norm": 0.8847266907483391, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 11979 + }, + { + "epoch": 0.1198, + "grad_norm": 0.9312694274233432, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 11980 + }, + { + "epoch": 0.11981, + "grad_norm": 0.9461444011302217, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 11981 + }, + { + "epoch": 0.11982, + "grad_norm": 0.8758913587065924, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 11982 + }, + { + "epoch": 0.11983, + "grad_norm": 0.8760617492377928, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 11983 + }, + { + "epoch": 0.11984, + "grad_norm": 0.7243109034421162, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 11984 + }, + { + "epoch": 0.11985, + "grad_norm": 0.7004600396377721, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 11985 + }, + { + "epoch": 0.11986, + "grad_norm": 0.7221015934897439, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 11986 + }, + { + "epoch": 0.11987, + "grad_norm": 0.7871829593065788, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 11987 + }, + { + "epoch": 0.11988, + "grad_norm": 0.7075770816654835, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 11988 + }, + { + "epoch": 0.11989, + "grad_norm": 0.7606136914808911, + "learning_rate": 0.003, + "loss": 4.082, + "step": 11989 + }, + { + "epoch": 0.1199, + "grad_norm": 0.9010988151799043, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 11990 + }, + { + "epoch": 0.11991, + "grad_norm": 0.9786531877527461, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 11991 + }, + { + "epoch": 0.11992, + "grad_norm": 1.0463234459105692, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 11992 + }, + { + "epoch": 0.11993, + "grad_norm": 0.9405513356826912, + "learning_rate": 0.003, + "loss": 4.065, + "step": 11993 + }, + { + "epoch": 0.11994, + "grad_norm": 0.9646495068141073, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 11994 + }, + { + "epoch": 0.11995, + "grad_norm": 1.0309215622098242, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 11995 + }, + { + "epoch": 0.11996, + "grad_norm": 0.9241477306836697, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 11996 + }, + { + "epoch": 0.11997, + "grad_norm": 0.8097824344911, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 11997 + }, + { + "epoch": 0.11998, + "grad_norm": 0.7067736238117723, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 11998 + }, + { + "epoch": 0.11999, + "grad_norm": 0.6538639483861481, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 11999 + }, + { + "epoch": 0.12, + "grad_norm": 0.7659243077400497, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 12000 + }, + { + "epoch": 0.12001, + "grad_norm": 0.8023521521090117, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 12001 + }, + { + "epoch": 0.12002, + "grad_norm": 0.961242206121727, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 12002 + }, + { + "epoch": 0.12003, + "grad_norm": 1.2526177653248893, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 12003 + }, + { + "epoch": 0.12004, + "grad_norm": 0.8410304975750107, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 12004 + }, + { + "epoch": 0.12005, + "grad_norm": 0.7279790520142565, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 12005 + }, + { + "epoch": 0.12006, + "grad_norm": 0.7285212920146146, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 12006 + }, + { + "epoch": 0.12007, + "grad_norm": 0.6647624461558206, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 12007 + }, + { + "epoch": 0.12008, + "grad_norm": 0.6419513250348899, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 12008 + }, + { + "epoch": 0.12009, + "grad_norm": 0.642549189417902, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 12009 + }, + { + "epoch": 0.1201, + "grad_norm": 0.7023091498881583, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 12010 + }, + { + "epoch": 0.12011, + "grad_norm": 0.7171302422861198, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 12011 + }, + { + "epoch": 0.12012, + "grad_norm": 0.7942440278284121, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 12012 + }, + { + "epoch": 0.12013, + "grad_norm": 0.8896477823603075, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 12013 + }, + { + "epoch": 0.12014, + "grad_norm": 0.8837668069948614, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 12014 + }, + { + "epoch": 0.12015, + "grad_norm": 0.8435689467729217, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 12015 + }, + { + "epoch": 0.12016, + "grad_norm": 0.9212366189235262, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 12016 + }, + { + "epoch": 0.12017, + "grad_norm": 0.9053821425733813, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 12017 + }, + { + "epoch": 0.12018, + "grad_norm": 1.070929549236012, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 12018 + }, + { + "epoch": 0.12019, + "grad_norm": 1.1632855793109629, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 12019 + }, + { + "epoch": 0.1202, + "grad_norm": 0.9689388366189522, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 12020 + }, + { + "epoch": 0.12021, + "grad_norm": 0.8805816300549908, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 12021 + }, + { + "epoch": 0.12022, + "grad_norm": 0.9393700862182391, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 12022 + }, + { + "epoch": 0.12023, + "grad_norm": 0.9408800937079208, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 12023 + }, + { + "epoch": 0.12024, + "grad_norm": 0.8780239980937175, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 12024 + }, + { + "epoch": 0.12025, + "grad_norm": 0.8283146623762387, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 12025 + }, + { + "epoch": 0.12026, + "grad_norm": 0.8358219955732257, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 12026 + }, + { + "epoch": 0.12027, + "grad_norm": 0.6763683484972164, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 12027 + }, + { + "epoch": 0.12028, + "grad_norm": 0.7241408294923761, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 12028 + }, + { + "epoch": 0.12029, + "grad_norm": 0.9977567222444976, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 12029 + }, + { + "epoch": 0.1203, + "grad_norm": 1.4343501765745075, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 12030 + }, + { + "epoch": 0.12031, + "grad_norm": 0.5934750778806325, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 12031 + }, + { + "epoch": 0.12032, + "grad_norm": 0.7317713488446722, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 12032 + }, + { + "epoch": 0.12033, + "grad_norm": 0.8453119025974559, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 12033 + }, + { + "epoch": 0.12034, + "grad_norm": 0.8886814508473693, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 12034 + }, + { + "epoch": 0.12035, + "grad_norm": 0.9104362807341917, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 12035 + }, + { + "epoch": 0.12036, + "grad_norm": 0.9600326587977933, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 12036 + }, + { + "epoch": 0.12037, + "grad_norm": 0.8723262804067118, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 12037 + }, + { + "epoch": 0.12038, + "grad_norm": 0.8081761707745708, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 12038 + }, + { + "epoch": 0.12039, + "grad_norm": 0.7495659446732845, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 12039 + }, + { + "epoch": 0.1204, + "grad_norm": 0.7120487632255719, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 12040 + }, + { + "epoch": 0.12041, + "grad_norm": 0.8530661194106129, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 12041 + }, + { + "epoch": 0.12042, + "grad_norm": 0.882956070651322, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 12042 + }, + { + "epoch": 0.12043, + "grad_norm": 0.8017395404248646, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 12043 + }, + { + "epoch": 0.12044, + "grad_norm": 0.6723683416837811, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 12044 + }, + { + "epoch": 0.12045, + "grad_norm": 0.6584653483199643, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 12045 + }, + { + "epoch": 0.12046, + "grad_norm": 0.7035025023856035, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 12046 + }, + { + "epoch": 0.12047, + "grad_norm": 0.8530059671037054, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 12047 + }, + { + "epoch": 0.12048, + "grad_norm": 1.0610477177598243, + "learning_rate": 0.003, + "loss": 4.091, + "step": 12048 + }, + { + "epoch": 0.12049, + "grad_norm": 0.925017658633742, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 12049 + }, + { + "epoch": 0.1205, + "grad_norm": 0.9545499424695932, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 12050 + }, + { + "epoch": 0.12051, + "grad_norm": 0.9724560908706467, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 12051 + }, + { + "epoch": 0.12052, + "grad_norm": 1.0799923024937357, + "learning_rate": 0.003, + "loss": 4.106, + "step": 12052 + }, + { + "epoch": 0.12053, + "grad_norm": 0.9852278779342245, + "learning_rate": 0.003, + "loss": 4.097, + "step": 12053 + }, + { + "epoch": 0.12054, + "grad_norm": 1.1082986645884205, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 12054 + }, + { + "epoch": 0.12055, + "grad_norm": 0.9782150774884669, + "learning_rate": 0.003, + "loss": 4.11, + "step": 12055 + }, + { + "epoch": 0.12056, + "grad_norm": 0.981583538098505, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 12056 + }, + { + "epoch": 0.12057, + "grad_norm": 0.9000013529274676, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 12057 + }, + { + "epoch": 0.12058, + "grad_norm": 0.8248975117610136, + "learning_rate": 0.003, + "loss": 4.1152, + "step": 12058 + }, + { + "epoch": 0.12059, + "grad_norm": 0.7316550518767464, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 12059 + }, + { + "epoch": 0.1206, + "grad_norm": 0.8694276237701707, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 12060 + }, + { + "epoch": 0.12061, + "grad_norm": 0.938263264905831, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 12061 + }, + { + "epoch": 0.12062, + "grad_norm": 1.0585572997712702, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 12062 + }, + { + "epoch": 0.12063, + "grad_norm": 1.0263333261586491, + "learning_rate": 0.003, + "loss": 4.1, + "step": 12063 + }, + { + "epoch": 0.12064, + "grad_norm": 0.9744024691980951, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 12064 + }, + { + "epoch": 0.12065, + "grad_norm": 0.9723547264958703, + "learning_rate": 0.003, + "loss": 4.083, + "step": 12065 + }, + { + "epoch": 0.12066, + "grad_norm": 0.8621056775718198, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 12066 + }, + { + "epoch": 0.12067, + "grad_norm": 0.760732961962908, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 12067 + }, + { + "epoch": 0.12068, + "grad_norm": 0.7224923416076042, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 12068 + }, + { + "epoch": 0.12069, + "grad_norm": 0.6985051056644312, + "learning_rate": 0.003, + "loss": 4.075, + "step": 12069 + }, + { + "epoch": 0.1207, + "grad_norm": 0.6892685380871093, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 12070 + }, + { + "epoch": 0.12071, + "grad_norm": 0.7753502411874906, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 12071 + }, + { + "epoch": 0.12072, + "grad_norm": 1.0273084993720307, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 12072 + }, + { + "epoch": 0.12073, + "grad_norm": 1.1702943961349295, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 12073 + }, + { + "epoch": 0.12074, + "grad_norm": 0.7291161680358292, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 12074 + }, + { + "epoch": 0.12075, + "grad_norm": 0.6377229522581281, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 12075 + }, + { + "epoch": 0.12076, + "grad_norm": 0.655318568436041, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 12076 + }, + { + "epoch": 0.12077, + "grad_norm": 0.5461348060545833, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 12077 + }, + { + "epoch": 0.12078, + "grad_norm": 0.5313314121879531, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 12078 + }, + { + "epoch": 0.12079, + "grad_norm": 0.45366667880293565, + "learning_rate": 0.003, + "loss": 4.062, + "step": 12079 + }, + { + "epoch": 0.1208, + "grad_norm": 0.4978555488964222, + "learning_rate": 0.003, + "loss": 4.079, + "step": 12080 + }, + { + "epoch": 0.12081, + "grad_norm": 0.5435159491187604, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 12081 + }, + { + "epoch": 0.12082, + "grad_norm": 0.7004071495504851, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 12082 + }, + { + "epoch": 0.12083, + "grad_norm": 0.8819757494392108, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 12083 + }, + { + "epoch": 0.12084, + "grad_norm": 0.9984371636379986, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 12084 + }, + { + "epoch": 0.12085, + "grad_norm": 0.9377941167196149, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 12085 + }, + { + "epoch": 0.12086, + "grad_norm": 0.777402896704748, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 12086 + }, + { + "epoch": 0.12087, + "grad_norm": 0.6768328289763865, + "learning_rate": 0.003, + "loss": 4.1068, + "step": 12087 + }, + { + "epoch": 0.12088, + "grad_norm": 0.6550784747702699, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 12088 + }, + { + "epoch": 0.12089, + "grad_norm": 0.6799046805949871, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 12089 + }, + { + "epoch": 0.1209, + "grad_norm": 0.6888254527740507, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 12090 + }, + { + "epoch": 0.12091, + "grad_norm": 0.6698631300665974, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 12091 + }, + { + "epoch": 0.12092, + "grad_norm": 0.6758036757087778, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 12092 + }, + { + "epoch": 0.12093, + "grad_norm": 0.7516931246977546, + "learning_rate": 0.003, + "loss": 4.085, + "step": 12093 + }, + { + "epoch": 0.12094, + "grad_norm": 0.8660913984037959, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 12094 + }, + { + "epoch": 0.12095, + "grad_norm": 1.148225611386786, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 12095 + }, + { + "epoch": 0.12096, + "grad_norm": 0.9430544649654374, + "learning_rate": 0.003, + "loss": 4.07, + "step": 12096 + }, + { + "epoch": 0.12097, + "grad_norm": 0.8004448350274224, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 12097 + }, + { + "epoch": 0.12098, + "grad_norm": 0.8437588482083355, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 12098 + }, + { + "epoch": 0.12099, + "grad_norm": 0.9654506646429969, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 12099 + }, + { + "epoch": 0.121, + "grad_norm": 0.8052346106708665, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 12100 + }, + { + "epoch": 0.12101, + "grad_norm": 0.7761954135951914, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 12101 + }, + { + "epoch": 0.12102, + "grad_norm": 0.7865471443949508, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 12102 + }, + { + "epoch": 0.12103, + "grad_norm": 0.908020254613268, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 12103 + }, + { + "epoch": 0.12104, + "grad_norm": 0.9714484820971058, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 12104 + }, + { + "epoch": 0.12105, + "grad_norm": 1.1516958837405888, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 12105 + }, + { + "epoch": 0.12106, + "grad_norm": 0.956438837762759, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 12106 + }, + { + "epoch": 0.12107, + "grad_norm": 0.9605465015934317, + "learning_rate": 0.003, + "loss": 4.113, + "step": 12107 + }, + { + "epoch": 0.12108, + "grad_norm": 1.0015242903697923, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 12108 + }, + { + "epoch": 0.12109, + "grad_norm": 0.9656350030878798, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 12109 + }, + { + "epoch": 0.1211, + "grad_norm": 1.0208554246207817, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 12110 + }, + { + "epoch": 0.12111, + "grad_norm": 1.045212071755535, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 12111 + }, + { + "epoch": 0.12112, + "grad_norm": 0.9609127200304951, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 12112 + }, + { + "epoch": 0.12113, + "grad_norm": 0.9315381484230759, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 12113 + }, + { + "epoch": 0.12114, + "grad_norm": 1.1086423634112808, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 12114 + }, + { + "epoch": 0.12115, + "grad_norm": 0.9523300917629679, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 12115 + }, + { + "epoch": 0.12116, + "grad_norm": 0.8687541089117045, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 12116 + }, + { + "epoch": 0.12117, + "grad_norm": 0.7873255227446385, + "learning_rate": 0.003, + "loss": 4.1131, + "step": 12117 + }, + { + "epoch": 0.12118, + "grad_norm": 0.7821982607193855, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 12118 + }, + { + "epoch": 0.12119, + "grad_norm": 0.7616462691624264, + "learning_rate": 0.003, + "loss": 4.106, + "step": 12119 + }, + { + "epoch": 0.1212, + "grad_norm": 0.707120879825376, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 12120 + }, + { + "epoch": 0.12121, + "grad_norm": 0.8449356180095963, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 12121 + }, + { + "epoch": 0.12122, + "grad_norm": 0.891037945284292, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 12122 + }, + { + "epoch": 0.12123, + "grad_norm": 0.8461260410105871, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 12123 + }, + { + "epoch": 0.12124, + "grad_norm": 0.9045354603549911, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 12124 + }, + { + "epoch": 0.12125, + "grad_norm": 1.0010016238193329, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 12125 + }, + { + "epoch": 0.12126, + "grad_norm": 0.9632668167060271, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 12126 + }, + { + "epoch": 0.12127, + "grad_norm": 0.9500807246486168, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 12127 + }, + { + "epoch": 0.12128, + "grad_norm": 0.8874953984881597, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 12128 + }, + { + "epoch": 0.12129, + "grad_norm": 0.7891377567378397, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 12129 + }, + { + "epoch": 0.1213, + "grad_norm": 0.7152733415442877, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 12130 + }, + { + "epoch": 0.12131, + "grad_norm": 0.7280311154787182, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 12131 + }, + { + "epoch": 0.12132, + "grad_norm": 0.708291815992429, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 12132 + }, + { + "epoch": 0.12133, + "grad_norm": 0.7402534639010128, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 12133 + }, + { + "epoch": 0.12134, + "grad_norm": 0.8119931330116694, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 12134 + }, + { + "epoch": 0.12135, + "grad_norm": 0.8350450819192635, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 12135 + }, + { + "epoch": 0.12136, + "grad_norm": 0.8039762209981959, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 12136 + }, + { + "epoch": 0.12137, + "grad_norm": 0.7253700956457722, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 12137 + }, + { + "epoch": 0.12138, + "grad_norm": 0.821368229958523, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 12138 + }, + { + "epoch": 0.12139, + "grad_norm": 0.8890056249264902, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 12139 + }, + { + "epoch": 0.1214, + "grad_norm": 1.0673718378818036, + "learning_rate": 0.003, + "loss": 4.1068, + "step": 12140 + }, + { + "epoch": 0.12141, + "grad_norm": 0.998773771713253, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 12141 + }, + { + "epoch": 0.12142, + "grad_norm": 0.8343867553391711, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 12142 + }, + { + "epoch": 0.12143, + "grad_norm": 0.7439287321630871, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 12143 + }, + { + "epoch": 0.12144, + "grad_norm": 0.7548192958000438, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 12144 + }, + { + "epoch": 0.12145, + "grad_norm": 0.6863287660828014, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 12145 + }, + { + "epoch": 0.12146, + "grad_norm": 0.6374615352659703, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 12146 + }, + { + "epoch": 0.12147, + "grad_norm": 0.6626425129250582, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 12147 + }, + { + "epoch": 0.12148, + "grad_norm": 0.664469899083491, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 12148 + }, + { + "epoch": 0.12149, + "grad_norm": 0.6726604782386362, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 12149 + }, + { + "epoch": 0.1215, + "grad_norm": 0.6071104446064951, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 12150 + }, + { + "epoch": 0.12151, + "grad_norm": 0.759680928265347, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 12151 + }, + { + "epoch": 0.12152, + "grad_norm": 0.9315784295233887, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 12152 + }, + { + "epoch": 0.12153, + "grad_norm": 1.320521685808312, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 12153 + }, + { + "epoch": 0.12154, + "grad_norm": 0.6453639037401027, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 12154 + }, + { + "epoch": 0.12155, + "grad_norm": 0.6706571195781358, + "learning_rate": 0.003, + "loss": 4.051, + "step": 12155 + }, + { + "epoch": 0.12156, + "grad_norm": 0.7951848965645847, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 12156 + }, + { + "epoch": 0.12157, + "grad_norm": 0.7596900007881088, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 12157 + }, + { + "epoch": 0.12158, + "grad_norm": 0.8127025968699181, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 12158 + }, + { + "epoch": 0.12159, + "grad_norm": 0.876589766771459, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 12159 + }, + { + "epoch": 0.1216, + "grad_norm": 0.8972856167148897, + "learning_rate": 0.003, + "loss": 4.066, + "step": 12160 + }, + { + "epoch": 0.12161, + "grad_norm": 0.8563068013271339, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 12161 + }, + { + "epoch": 0.12162, + "grad_norm": 0.8306703766568939, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 12162 + }, + { + "epoch": 0.12163, + "grad_norm": 0.9815681209976873, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 12163 + }, + { + "epoch": 0.12164, + "grad_norm": 1.0912966250633331, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 12164 + }, + { + "epoch": 0.12165, + "grad_norm": 0.9444453508980822, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 12165 + }, + { + "epoch": 0.12166, + "grad_norm": 0.9043036529174772, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 12166 + }, + { + "epoch": 0.12167, + "grad_norm": 0.8864176334512155, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 12167 + }, + { + "epoch": 0.12168, + "grad_norm": 0.9185772588555268, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 12168 + }, + { + "epoch": 0.12169, + "grad_norm": 0.9369480001448344, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 12169 + }, + { + "epoch": 0.1217, + "grad_norm": 1.095455332190787, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 12170 + }, + { + "epoch": 0.12171, + "grad_norm": 0.9453296225202272, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 12171 + }, + { + "epoch": 0.12172, + "grad_norm": 0.8178161188650642, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 12172 + }, + { + "epoch": 0.12173, + "grad_norm": 0.6656216309214364, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 12173 + }, + { + "epoch": 0.12174, + "grad_norm": 0.6143330151167342, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 12174 + }, + { + "epoch": 0.12175, + "grad_norm": 0.6095956765502458, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 12175 + }, + { + "epoch": 0.12176, + "grad_norm": 0.6274349985290663, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 12176 + }, + { + "epoch": 0.12177, + "grad_norm": 0.6963270153737929, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 12177 + }, + { + "epoch": 0.12178, + "grad_norm": 0.8053955223339382, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 12178 + }, + { + "epoch": 0.12179, + "grad_norm": 0.8560733484824161, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 12179 + }, + { + "epoch": 0.1218, + "grad_norm": 0.7746834097286917, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 12180 + }, + { + "epoch": 0.12181, + "grad_norm": 0.7575071571844539, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 12181 + }, + { + "epoch": 0.12182, + "grad_norm": 0.8423095942998952, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 12182 + }, + { + "epoch": 0.12183, + "grad_norm": 0.8305437413626926, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 12183 + }, + { + "epoch": 0.12184, + "grad_norm": 0.8213877652517387, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 12184 + }, + { + "epoch": 0.12185, + "grad_norm": 0.7800946606127453, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 12185 + }, + { + "epoch": 0.12186, + "grad_norm": 0.8897274458424853, + "learning_rate": 0.003, + "loss": 4.067, + "step": 12186 + }, + { + "epoch": 0.12187, + "grad_norm": 1.0276624626599788, + "learning_rate": 0.003, + "loss": 4.092, + "step": 12187 + }, + { + "epoch": 0.12188, + "grad_norm": 1.0309198946336533, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 12188 + }, + { + "epoch": 0.12189, + "grad_norm": 0.9790137072694165, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 12189 + }, + { + "epoch": 0.1219, + "grad_norm": 0.9688236140763872, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 12190 + }, + { + "epoch": 0.12191, + "grad_norm": 1.0222420660886349, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 12191 + }, + { + "epoch": 0.12192, + "grad_norm": 1.04032773609806, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 12192 + }, + { + "epoch": 0.12193, + "grad_norm": 0.9148037001203069, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 12193 + }, + { + "epoch": 0.12194, + "grad_norm": 0.8461984518265281, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 12194 + }, + { + "epoch": 0.12195, + "grad_norm": 0.8962201511783212, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 12195 + }, + { + "epoch": 0.12196, + "grad_norm": 0.8998155876981508, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 12196 + }, + { + "epoch": 0.12197, + "grad_norm": 0.869197527370743, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 12197 + }, + { + "epoch": 0.12198, + "grad_norm": 0.9925011333423569, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 12198 + }, + { + "epoch": 0.12199, + "grad_norm": 0.9924066788111143, + "learning_rate": 0.003, + "loss": 4.093, + "step": 12199 + }, + { + "epoch": 0.122, + "grad_norm": 0.9780810833776715, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 12200 + }, + { + "epoch": 0.12201, + "grad_norm": 0.7749765569011773, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 12201 + }, + { + "epoch": 0.12202, + "grad_norm": 0.68963493036336, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 12202 + }, + { + "epoch": 0.12203, + "grad_norm": 0.6944315750387414, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 12203 + }, + { + "epoch": 0.12204, + "grad_norm": 0.7092490852739447, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 12204 + }, + { + "epoch": 0.12205, + "grad_norm": 0.7824985651770687, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 12205 + }, + { + "epoch": 0.12206, + "grad_norm": 0.7841193956816908, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 12206 + }, + { + "epoch": 0.12207, + "grad_norm": 0.7727636401343367, + "learning_rate": 0.003, + "loss": 4.121, + "step": 12207 + }, + { + "epoch": 0.12208, + "grad_norm": 0.8821542042505839, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 12208 + }, + { + "epoch": 0.12209, + "grad_norm": 1.1296209326467996, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 12209 + }, + { + "epoch": 0.1221, + "grad_norm": 1.0041093442524542, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 12210 + }, + { + "epoch": 0.12211, + "grad_norm": 0.8895041277507798, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 12211 + }, + { + "epoch": 0.12212, + "grad_norm": 0.7247035510410651, + "learning_rate": 0.003, + "loss": 4.078, + "step": 12212 + }, + { + "epoch": 0.12213, + "grad_norm": 0.6265246520373837, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 12213 + }, + { + "epoch": 0.12214, + "grad_norm": 0.5284681468382831, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 12214 + }, + { + "epoch": 0.12215, + "grad_norm": 0.5469626070392162, + "learning_rate": 0.003, + "loss": 4.097, + "step": 12215 + }, + { + "epoch": 0.12216, + "grad_norm": 0.6047355431829017, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 12216 + }, + { + "epoch": 0.12217, + "grad_norm": 0.5873750180070837, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 12217 + }, + { + "epoch": 0.12218, + "grad_norm": 0.6158510675727711, + "learning_rate": 0.003, + "loss": 4.051, + "step": 12218 + }, + { + "epoch": 0.12219, + "grad_norm": 0.6121083851344632, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 12219 + }, + { + "epoch": 0.1222, + "grad_norm": 0.7609522642713573, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 12220 + }, + { + "epoch": 0.12221, + "grad_norm": 0.8809276711303994, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 12221 + }, + { + "epoch": 0.12222, + "grad_norm": 0.9475649039258731, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 12222 + }, + { + "epoch": 0.12223, + "grad_norm": 0.9629002752009357, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 12223 + }, + { + "epoch": 0.12224, + "grad_norm": 1.008015518858347, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 12224 + }, + { + "epoch": 0.12225, + "grad_norm": 0.9237570720650717, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 12225 + }, + { + "epoch": 0.12226, + "grad_norm": 0.8799849671986355, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 12226 + }, + { + "epoch": 0.12227, + "grad_norm": 0.9346772009432872, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 12227 + }, + { + "epoch": 0.12228, + "grad_norm": 0.8533561203210697, + "learning_rate": 0.003, + "loss": 4.051, + "step": 12228 + }, + { + "epoch": 0.12229, + "grad_norm": 0.9299445302671507, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 12229 + }, + { + "epoch": 0.1223, + "grad_norm": 0.970061316848315, + "learning_rate": 0.003, + "loss": 4.1173, + "step": 12230 + }, + { + "epoch": 0.12231, + "grad_norm": 1.04935587821548, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 12231 + }, + { + "epoch": 0.12232, + "grad_norm": 0.977851356364189, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 12232 + }, + { + "epoch": 0.12233, + "grad_norm": 0.8433149915900962, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 12233 + }, + { + "epoch": 0.12234, + "grad_norm": 0.8496072567309231, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 12234 + }, + { + "epoch": 0.12235, + "grad_norm": 0.889913163004263, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 12235 + }, + { + "epoch": 0.12236, + "grad_norm": 0.9829956720219396, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 12236 + }, + { + "epoch": 0.12237, + "grad_norm": 0.9758935947825471, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 12237 + }, + { + "epoch": 0.12238, + "grad_norm": 0.905293659895765, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 12238 + }, + { + "epoch": 0.12239, + "grad_norm": 0.880979930850334, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 12239 + }, + { + "epoch": 0.1224, + "grad_norm": 0.832007631237703, + "learning_rate": 0.003, + "loss": 4.1115, + "step": 12240 + }, + { + "epoch": 0.12241, + "grad_norm": 0.9034049863110921, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 12241 + }, + { + "epoch": 0.12242, + "grad_norm": 0.976237493191658, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 12242 + }, + { + "epoch": 0.12243, + "grad_norm": 1.1732103912413159, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 12243 + }, + { + "epoch": 0.12244, + "grad_norm": 1.020065784383985, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 12244 + }, + { + "epoch": 0.12245, + "grad_norm": 0.9639699286831159, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 12245 + }, + { + "epoch": 0.12246, + "grad_norm": 0.9814805407910625, + "learning_rate": 0.003, + "loss": 4.094, + "step": 12246 + }, + { + "epoch": 0.12247, + "grad_norm": 0.8533786196620828, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 12247 + }, + { + "epoch": 0.12248, + "grad_norm": 0.6662766502862788, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 12248 + }, + { + "epoch": 0.12249, + "grad_norm": 0.6702892402590316, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 12249 + }, + { + "epoch": 0.1225, + "grad_norm": 0.6533489392576404, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 12250 + }, + { + "epoch": 0.12251, + "grad_norm": 0.5709588901774177, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 12251 + }, + { + "epoch": 0.12252, + "grad_norm": 0.5718032323858817, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 12252 + }, + { + "epoch": 0.12253, + "grad_norm": 0.5424113872571699, + "learning_rate": 0.003, + "loss": 4.076, + "step": 12253 + }, + { + "epoch": 0.12254, + "grad_norm": 0.5122148366369595, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 12254 + }, + { + "epoch": 0.12255, + "grad_norm": 0.6207724489282378, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 12255 + }, + { + "epoch": 0.12256, + "grad_norm": 0.8036473546761361, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 12256 + }, + { + "epoch": 0.12257, + "grad_norm": 1.0622283319885992, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 12257 + }, + { + "epoch": 0.12258, + "grad_norm": 1.0380816145963498, + "learning_rate": 0.003, + "loss": 4.065, + "step": 12258 + }, + { + "epoch": 0.12259, + "grad_norm": 0.9805063613712668, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 12259 + }, + { + "epoch": 0.1226, + "grad_norm": 0.8470946725098341, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 12260 + }, + { + "epoch": 0.12261, + "grad_norm": 0.7167763323497804, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 12261 + }, + { + "epoch": 0.12262, + "grad_norm": 0.8492307595053198, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 12262 + }, + { + "epoch": 0.12263, + "grad_norm": 0.7470430691505191, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 12263 + }, + { + "epoch": 0.12264, + "grad_norm": 0.7437026810187541, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 12264 + }, + { + "epoch": 0.12265, + "grad_norm": 0.6921338875604943, + "learning_rate": 0.003, + "loss": 4.062, + "step": 12265 + }, + { + "epoch": 0.12266, + "grad_norm": 0.6340720803713298, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 12266 + }, + { + "epoch": 0.12267, + "grad_norm": 0.5996048202296703, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 12267 + }, + { + "epoch": 0.12268, + "grad_norm": 0.5253278971864735, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 12268 + }, + { + "epoch": 0.12269, + "grad_norm": 0.5059015116943797, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 12269 + }, + { + "epoch": 0.1227, + "grad_norm": 0.5611070010602227, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 12270 + }, + { + "epoch": 0.12271, + "grad_norm": 0.6010429068065455, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 12271 + }, + { + "epoch": 0.12272, + "grad_norm": 0.6560178371037455, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 12272 + }, + { + "epoch": 0.12273, + "grad_norm": 0.7535031170180342, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 12273 + }, + { + "epoch": 0.12274, + "grad_norm": 0.9517936135796402, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 12274 + }, + { + "epoch": 0.12275, + "grad_norm": 1.178351373783213, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 12275 + }, + { + "epoch": 0.12276, + "grad_norm": 0.8912268880412101, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 12276 + }, + { + "epoch": 0.12277, + "grad_norm": 0.8855062963775535, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 12277 + }, + { + "epoch": 0.12278, + "grad_norm": 0.8988888499927662, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 12278 + }, + { + "epoch": 0.12279, + "grad_norm": 0.9681405602313218, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 12279 + }, + { + "epoch": 0.1228, + "grad_norm": 0.840459247453639, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 12280 + }, + { + "epoch": 0.12281, + "grad_norm": 0.7862761387462439, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 12281 + }, + { + "epoch": 0.12282, + "grad_norm": 0.7910672908173746, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 12282 + }, + { + "epoch": 0.12283, + "grad_norm": 0.7996847191182868, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 12283 + }, + { + "epoch": 0.12284, + "grad_norm": 0.9761078001765382, + "learning_rate": 0.003, + "loss": 4.057, + "step": 12284 + }, + { + "epoch": 0.12285, + "grad_norm": 1.2122969362612241, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 12285 + }, + { + "epoch": 0.12286, + "grad_norm": 0.7982955277029816, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 12286 + }, + { + "epoch": 0.12287, + "grad_norm": 0.6832723853119029, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 12287 + }, + { + "epoch": 0.12288, + "grad_norm": 0.7410385916665433, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 12288 + }, + { + "epoch": 0.12289, + "grad_norm": 0.7795829113013923, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 12289 + }, + { + "epoch": 0.1229, + "grad_norm": 0.8203151883451628, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 12290 + }, + { + "epoch": 0.12291, + "grad_norm": 1.0129421338927522, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 12291 + }, + { + "epoch": 0.12292, + "grad_norm": 1.061571835074904, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 12292 + }, + { + "epoch": 0.12293, + "grad_norm": 0.8374218946860755, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 12293 + }, + { + "epoch": 0.12294, + "grad_norm": 0.7543087370375461, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 12294 + }, + { + "epoch": 0.12295, + "grad_norm": 0.7706914881903262, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 12295 + }, + { + "epoch": 0.12296, + "grad_norm": 0.7607184633400262, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 12296 + }, + { + "epoch": 0.12297, + "grad_norm": 0.7724401795263441, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 12297 + }, + { + "epoch": 0.12298, + "grad_norm": 0.8638283512198204, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 12298 + }, + { + "epoch": 0.12299, + "grad_norm": 0.8982120323560557, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 12299 + }, + { + "epoch": 0.123, + "grad_norm": 0.9124691457151956, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 12300 + }, + { + "epoch": 0.12301, + "grad_norm": 1.0861149071905536, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 12301 + }, + { + "epoch": 0.12302, + "grad_norm": 1.056574819962284, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 12302 + }, + { + "epoch": 0.12303, + "grad_norm": 0.8962846526441469, + "learning_rate": 0.003, + "loss": 4.1494, + "step": 12303 + }, + { + "epoch": 0.12304, + "grad_norm": 0.7551672668971874, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 12304 + }, + { + "epoch": 0.12305, + "grad_norm": 0.7687559626159957, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 12305 + }, + { + "epoch": 0.12306, + "grad_norm": 0.7832740456242118, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 12306 + }, + { + "epoch": 0.12307, + "grad_norm": 0.8334537225717343, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 12307 + }, + { + "epoch": 0.12308, + "grad_norm": 1.0396736321564775, + "learning_rate": 0.003, + "loss": 4.101, + "step": 12308 + }, + { + "epoch": 0.12309, + "grad_norm": 1.0988737205635373, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 12309 + }, + { + "epoch": 0.1231, + "grad_norm": 0.7415438867578784, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 12310 + }, + { + "epoch": 0.12311, + "grad_norm": 0.6274126363565536, + "learning_rate": 0.003, + "loss": 4.06, + "step": 12311 + }, + { + "epoch": 0.12312, + "grad_norm": 0.6460470961906579, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 12312 + }, + { + "epoch": 0.12313, + "grad_norm": 0.7743078883377551, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 12313 + }, + { + "epoch": 0.12314, + "grad_norm": 0.9427175393301743, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 12314 + }, + { + "epoch": 0.12315, + "grad_norm": 0.8781349943437341, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 12315 + }, + { + "epoch": 0.12316, + "grad_norm": 0.8208307368125973, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 12316 + }, + { + "epoch": 0.12317, + "grad_norm": 0.7706107052937355, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 12317 + }, + { + "epoch": 0.12318, + "grad_norm": 0.7306685595290431, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 12318 + }, + { + "epoch": 0.12319, + "grad_norm": 0.7692113098316952, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 12319 + }, + { + "epoch": 0.1232, + "grad_norm": 0.8912250345171907, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 12320 + }, + { + "epoch": 0.12321, + "grad_norm": 0.9856624656988558, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 12321 + }, + { + "epoch": 0.12322, + "grad_norm": 1.005052658751967, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 12322 + }, + { + "epoch": 0.12323, + "grad_norm": 0.9148997566925725, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 12323 + }, + { + "epoch": 0.12324, + "grad_norm": 0.8558840802530098, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 12324 + }, + { + "epoch": 0.12325, + "grad_norm": 0.7961488539800927, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 12325 + }, + { + "epoch": 0.12326, + "grad_norm": 0.7793292980068781, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 12326 + }, + { + "epoch": 0.12327, + "grad_norm": 0.7290581138354479, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 12327 + }, + { + "epoch": 0.12328, + "grad_norm": 0.6926715810449556, + "learning_rate": 0.003, + "loss": 4.045, + "step": 12328 + }, + { + "epoch": 0.12329, + "grad_norm": 0.775214172914529, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 12329 + }, + { + "epoch": 0.1233, + "grad_norm": 0.82900822759267, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 12330 + }, + { + "epoch": 0.12331, + "grad_norm": 0.8842733850561869, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 12331 + }, + { + "epoch": 0.12332, + "grad_norm": 0.9751583110833055, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 12332 + }, + { + "epoch": 0.12333, + "grad_norm": 1.1071006881382284, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 12333 + }, + { + "epoch": 0.12334, + "grad_norm": 1.0117747800099712, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 12334 + }, + { + "epoch": 0.12335, + "grad_norm": 0.9163473104991442, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 12335 + }, + { + "epoch": 0.12336, + "grad_norm": 0.7932919852703744, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 12336 + }, + { + "epoch": 0.12337, + "grad_norm": 0.5938066929793493, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 12337 + }, + { + "epoch": 0.12338, + "grad_norm": 0.6757743849715577, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 12338 + }, + { + "epoch": 0.12339, + "grad_norm": 0.91201916345825, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 12339 + }, + { + "epoch": 0.1234, + "grad_norm": 1.1617820868185735, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 12340 + }, + { + "epoch": 0.12341, + "grad_norm": 0.7103223518308457, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 12341 + }, + { + "epoch": 0.12342, + "grad_norm": 0.6337513778485245, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 12342 + }, + { + "epoch": 0.12343, + "grad_norm": 0.7079606290293606, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 12343 + }, + { + "epoch": 0.12344, + "grad_norm": 0.6689056464669664, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 12344 + }, + { + "epoch": 0.12345, + "grad_norm": 0.5707386249666849, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 12345 + }, + { + "epoch": 0.12346, + "grad_norm": 0.6191142900461684, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 12346 + }, + { + "epoch": 0.12347, + "grad_norm": 0.599638430699415, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 12347 + }, + { + "epoch": 0.12348, + "grad_norm": 0.5975280171501979, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 12348 + }, + { + "epoch": 0.12349, + "grad_norm": 0.6641026027063925, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 12349 + }, + { + "epoch": 0.1235, + "grad_norm": 0.748212361076718, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 12350 + }, + { + "epoch": 0.12351, + "grad_norm": 0.8750142390126173, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 12351 + }, + { + "epoch": 0.12352, + "grad_norm": 1.059951964622264, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 12352 + }, + { + "epoch": 0.12353, + "grad_norm": 1.118541436018302, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 12353 + }, + { + "epoch": 0.12354, + "grad_norm": 0.8240209127136764, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 12354 + }, + { + "epoch": 0.12355, + "grad_norm": 0.7811652917801903, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 12355 + }, + { + "epoch": 0.12356, + "grad_norm": 0.8184349715564275, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 12356 + }, + { + "epoch": 0.12357, + "grad_norm": 0.8288548630561049, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 12357 + }, + { + "epoch": 0.12358, + "grad_norm": 0.6820615218677171, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 12358 + }, + { + "epoch": 0.12359, + "grad_norm": 0.6348546447588544, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 12359 + }, + { + "epoch": 0.1236, + "grad_norm": 0.5927829921374093, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 12360 + }, + { + "epoch": 0.12361, + "grad_norm": 0.6877751343256268, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 12361 + }, + { + "epoch": 0.12362, + "grad_norm": 0.6465142056870291, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 12362 + }, + { + "epoch": 0.12363, + "grad_norm": 0.6658993255518995, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 12363 + }, + { + "epoch": 0.12364, + "grad_norm": 0.812717613930872, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 12364 + }, + { + "epoch": 0.12365, + "grad_norm": 0.9841896762626663, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 12365 + }, + { + "epoch": 0.12366, + "grad_norm": 1.0444743383641244, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 12366 + }, + { + "epoch": 0.12367, + "grad_norm": 0.8735959659864412, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 12367 + }, + { + "epoch": 0.12368, + "grad_norm": 0.8104436938471729, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 12368 + }, + { + "epoch": 0.12369, + "grad_norm": 0.7647719667314723, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 12369 + }, + { + "epoch": 0.1237, + "grad_norm": 0.9798723311447599, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 12370 + }, + { + "epoch": 0.12371, + "grad_norm": 1.1073068164129845, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 12371 + }, + { + "epoch": 0.12372, + "grad_norm": 0.8783686640733369, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 12372 + }, + { + "epoch": 0.12373, + "grad_norm": 0.8571504331738046, + "learning_rate": 0.003, + "loss": 4.091, + "step": 12373 + }, + { + "epoch": 0.12374, + "grad_norm": 0.8824814783901349, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 12374 + }, + { + "epoch": 0.12375, + "grad_norm": 0.8370906418565935, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 12375 + }, + { + "epoch": 0.12376, + "grad_norm": 0.8566745651264175, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 12376 + }, + { + "epoch": 0.12377, + "grad_norm": 0.7896995443932111, + "learning_rate": 0.003, + "loss": 4.042, + "step": 12377 + }, + { + "epoch": 0.12378, + "grad_norm": 0.7716242348864555, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 12378 + }, + { + "epoch": 0.12379, + "grad_norm": 0.8896154225927658, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 12379 + }, + { + "epoch": 0.1238, + "grad_norm": 1.0655680616463061, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 12380 + }, + { + "epoch": 0.12381, + "grad_norm": 1.1344858251808465, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 12381 + }, + { + "epoch": 0.12382, + "grad_norm": 0.7336688413835789, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 12382 + }, + { + "epoch": 0.12383, + "grad_norm": 0.6335176185088268, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 12383 + }, + { + "epoch": 0.12384, + "grad_norm": 0.6164379207701107, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 12384 + }, + { + "epoch": 0.12385, + "grad_norm": 0.682851429818355, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 12385 + }, + { + "epoch": 0.12386, + "grad_norm": 0.7538417672700986, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 12386 + }, + { + "epoch": 0.12387, + "grad_norm": 0.7767506727160641, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 12387 + }, + { + "epoch": 0.12388, + "grad_norm": 0.725322805837034, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 12388 + }, + { + "epoch": 0.12389, + "grad_norm": 0.7855361772914995, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 12389 + }, + { + "epoch": 0.1239, + "grad_norm": 0.8377198942167394, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 12390 + }, + { + "epoch": 0.12391, + "grad_norm": 1.0220559200837096, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 12391 + }, + { + "epoch": 0.12392, + "grad_norm": 1.1679405125779883, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 12392 + }, + { + "epoch": 0.12393, + "grad_norm": 0.8079046078128874, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 12393 + }, + { + "epoch": 0.12394, + "grad_norm": 0.807912420130648, + "learning_rate": 0.003, + "loss": 4.076, + "step": 12394 + }, + { + "epoch": 0.12395, + "grad_norm": 0.8156046926666899, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 12395 + }, + { + "epoch": 0.12396, + "grad_norm": 0.6424939616056728, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 12396 + }, + { + "epoch": 0.12397, + "grad_norm": 0.6161565824363795, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 12397 + }, + { + "epoch": 0.12398, + "grad_norm": 0.6868072633878525, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 12398 + }, + { + "epoch": 0.12399, + "grad_norm": 0.8381894625150602, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 12399 + }, + { + "epoch": 0.124, + "grad_norm": 1.094648190282617, + "learning_rate": 0.003, + "loss": 4.128, + "step": 12400 + }, + { + "epoch": 0.12401, + "grad_norm": 1.0716932323490935, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 12401 + }, + { + "epoch": 0.12402, + "grad_norm": 0.8443736440934634, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 12402 + }, + { + "epoch": 0.12403, + "grad_norm": 0.8174631840031019, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 12403 + }, + { + "epoch": 0.12404, + "grad_norm": 0.8302272019753588, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 12404 + }, + { + "epoch": 0.12405, + "grad_norm": 0.8356565848410257, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 12405 + }, + { + "epoch": 0.12406, + "grad_norm": 0.8091300206792589, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 12406 + }, + { + "epoch": 0.12407, + "grad_norm": 0.7332184789133536, + "learning_rate": 0.003, + "loss": 4.07, + "step": 12407 + }, + { + "epoch": 0.12408, + "grad_norm": 0.8687526884068933, + "learning_rate": 0.003, + "loss": 4.093, + "step": 12408 + }, + { + "epoch": 0.12409, + "grad_norm": 0.8506045260272557, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 12409 + }, + { + "epoch": 0.1241, + "grad_norm": 0.7742830890220069, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 12410 + }, + { + "epoch": 0.12411, + "grad_norm": 0.7645377692364871, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 12411 + }, + { + "epoch": 0.12412, + "grad_norm": 0.6900258135012273, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 12412 + }, + { + "epoch": 0.12413, + "grad_norm": 0.7008170255398872, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 12413 + }, + { + "epoch": 0.12414, + "grad_norm": 0.7807544703948569, + "learning_rate": 0.003, + "loss": 4.061, + "step": 12414 + }, + { + "epoch": 0.12415, + "grad_norm": 1.0418165862715427, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 12415 + }, + { + "epoch": 0.12416, + "grad_norm": 1.2588624025355457, + "learning_rate": 0.003, + "loss": 4.1055, + "step": 12416 + }, + { + "epoch": 0.12417, + "grad_norm": 1.0174602374348387, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 12417 + }, + { + "epoch": 0.12418, + "grad_norm": 1.1048010081413318, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 12418 + }, + { + "epoch": 0.12419, + "grad_norm": 0.8166182229591952, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 12419 + }, + { + "epoch": 0.1242, + "grad_norm": 0.9415925328154592, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 12420 + }, + { + "epoch": 0.12421, + "grad_norm": 0.8690733941514077, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 12421 + }, + { + "epoch": 0.12422, + "grad_norm": 0.830745473123471, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 12422 + }, + { + "epoch": 0.12423, + "grad_norm": 0.8342523814524774, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 12423 + }, + { + "epoch": 0.12424, + "grad_norm": 0.894813671165597, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 12424 + }, + { + "epoch": 0.12425, + "grad_norm": 0.9884607649566219, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 12425 + }, + { + "epoch": 0.12426, + "grad_norm": 1.1435522369886537, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 12426 + }, + { + "epoch": 0.12427, + "grad_norm": 1.0095395273350793, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 12427 + }, + { + "epoch": 0.12428, + "grad_norm": 0.8441201562767902, + "learning_rate": 0.003, + "loss": 4.103, + "step": 12428 + }, + { + "epoch": 0.12429, + "grad_norm": 0.6643197463278329, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 12429 + }, + { + "epoch": 0.1243, + "grad_norm": 0.7675558389841266, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 12430 + }, + { + "epoch": 0.12431, + "grad_norm": 0.9729795890323124, + "learning_rate": 0.003, + "loss": 4.076, + "step": 12431 + }, + { + "epoch": 0.12432, + "grad_norm": 0.9891625907463222, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 12432 + }, + { + "epoch": 0.12433, + "grad_norm": 0.867863487443049, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 12433 + }, + { + "epoch": 0.12434, + "grad_norm": 0.8096004679100748, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 12434 + }, + { + "epoch": 0.12435, + "grad_norm": 0.7696790010013758, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 12435 + }, + { + "epoch": 0.12436, + "grad_norm": 0.7074429258609628, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 12436 + }, + { + "epoch": 0.12437, + "grad_norm": 0.6928512254596106, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 12437 + }, + { + "epoch": 0.12438, + "grad_norm": 0.7276723133589919, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 12438 + }, + { + "epoch": 0.12439, + "grad_norm": 0.8097392985970517, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 12439 + }, + { + "epoch": 0.1244, + "grad_norm": 0.9078311600338613, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 12440 + }, + { + "epoch": 0.12441, + "grad_norm": 0.9656091039492581, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 12441 + }, + { + "epoch": 0.12442, + "grad_norm": 0.8382607157716104, + "learning_rate": 0.003, + "loss": 4.083, + "step": 12442 + }, + { + "epoch": 0.12443, + "grad_norm": 0.7195463112482667, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 12443 + }, + { + "epoch": 0.12444, + "grad_norm": 0.692287995881062, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 12444 + }, + { + "epoch": 0.12445, + "grad_norm": 0.7260972533808546, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 12445 + }, + { + "epoch": 0.12446, + "grad_norm": 0.7545307041135739, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 12446 + }, + { + "epoch": 0.12447, + "grad_norm": 0.8399120440566367, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 12447 + }, + { + "epoch": 0.12448, + "grad_norm": 0.82477064475789, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 12448 + }, + { + "epoch": 0.12449, + "grad_norm": 0.879099511063207, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 12449 + }, + { + "epoch": 0.1245, + "grad_norm": 0.9756698583699438, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 12450 + }, + { + "epoch": 0.12451, + "grad_norm": 1.055809397868483, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 12451 + }, + { + "epoch": 0.12452, + "grad_norm": 0.9273043848413589, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 12452 + }, + { + "epoch": 0.12453, + "grad_norm": 0.8424893235278818, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 12453 + }, + { + "epoch": 0.12454, + "grad_norm": 0.7769934753116866, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 12454 + }, + { + "epoch": 0.12455, + "grad_norm": 0.8422414337643478, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 12455 + }, + { + "epoch": 0.12456, + "grad_norm": 1.0644890222125198, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 12456 + }, + { + "epoch": 0.12457, + "grad_norm": 1.034725852802373, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 12457 + }, + { + "epoch": 0.12458, + "grad_norm": 1.0201030281238366, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 12458 + }, + { + "epoch": 0.12459, + "grad_norm": 0.9606410872706322, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 12459 + }, + { + "epoch": 0.1246, + "grad_norm": 0.9802952571985856, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 12460 + }, + { + "epoch": 0.12461, + "grad_norm": 1.0145179379970155, + "learning_rate": 0.003, + "loss": 4.084, + "step": 12461 + }, + { + "epoch": 0.12462, + "grad_norm": 0.8941296602201455, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 12462 + }, + { + "epoch": 0.12463, + "grad_norm": 0.7880644625841127, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 12463 + }, + { + "epoch": 0.12464, + "grad_norm": 0.7478314581834201, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 12464 + }, + { + "epoch": 0.12465, + "grad_norm": 0.6874206299133098, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 12465 + }, + { + "epoch": 0.12466, + "grad_norm": 0.5532656428700546, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 12466 + }, + { + "epoch": 0.12467, + "grad_norm": 0.5085094451819194, + "learning_rate": 0.003, + "loss": 4.081, + "step": 12467 + }, + { + "epoch": 0.12468, + "grad_norm": 0.4833011801969543, + "learning_rate": 0.003, + "loss": 4.091, + "step": 12468 + }, + { + "epoch": 0.12469, + "grad_norm": 0.5658074187218443, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 12469 + }, + { + "epoch": 0.1247, + "grad_norm": 0.6584925064901498, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 12470 + }, + { + "epoch": 0.12471, + "grad_norm": 0.8347468519445372, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 12471 + }, + { + "epoch": 0.12472, + "grad_norm": 1.147252471970649, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 12472 + }, + { + "epoch": 0.12473, + "grad_norm": 0.9151661598565768, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 12473 + }, + { + "epoch": 0.12474, + "grad_norm": 0.7783521582318006, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 12474 + }, + { + "epoch": 0.12475, + "grad_norm": 0.754762458139977, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 12475 + }, + { + "epoch": 0.12476, + "grad_norm": 0.8076264580293819, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 12476 + }, + { + "epoch": 0.12477, + "grad_norm": 0.8440102098928844, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 12477 + }, + { + "epoch": 0.12478, + "grad_norm": 0.8292745717859727, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 12478 + }, + { + "epoch": 0.12479, + "grad_norm": 0.7712475170483982, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 12479 + }, + { + "epoch": 0.1248, + "grad_norm": 0.6625408481360877, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 12480 + }, + { + "epoch": 0.12481, + "grad_norm": 0.6502393782701698, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 12481 + }, + { + "epoch": 0.12482, + "grad_norm": 0.6758788180464269, + "learning_rate": 0.003, + "loss": 4.08, + "step": 12482 + }, + { + "epoch": 0.12483, + "grad_norm": 0.6727320857627304, + "learning_rate": 0.003, + "loss": 4.084, + "step": 12483 + }, + { + "epoch": 0.12484, + "grad_norm": 0.6356059088846429, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 12484 + }, + { + "epoch": 0.12485, + "grad_norm": 0.5870391387072846, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 12485 + }, + { + "epoch": 0.12486, + "grad_norm": 0.7368756368425331, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 12486 + }, + { + "epoch": 0.12487, + "grad_norm": 0.9396079302973953, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 12487 + }, + { + "epoch": 0.12488, + "grad_norm": 1.1292542302681363, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 12488 + }, + { + "epoch": 0.12489, + "grad_norm": 0.8528878129899282, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 12489 + }, + { + "epoch": 0.1249, + "grad_norm": 0.7516004369114074, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 12490 + }, + { + "epoch": 0.12491, + "grad_norm": 0.9147555580331406, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 12491 + }, + { + "epoch": 0.12492, + "grad_norm": 1.265380678620011, + "learning_rate": 0.003, + "loss": 4.1008, + "step": 12492 + }, + { + "epoch": 0.12493, + "grad_norm": 0.9085831832477737, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 12493 + }, + { + "epoch": 0.12494, + "grad_norm": 0.8933451452856955, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 12494 + }, + { + "epoch": 0.12495, + "grad_norm": 0.9486602997841618, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 12495 + }, + { + "epoch": 0.12496, + "grad_norm": 0.9671534076597736, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 12496 + }, + { + "epoch": 0.12497, + "grad_norm": 0.9748030910513085, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 12497 + }, + { + "epoch": 0.12498, + "grad_norm": 0.9304947010935671, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 12498 + }, + { + "epoch": 0.12499, + "grad_norm": 0.926582461696646, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 12499 + }, + { + "epoch": 0.125, + "grad_norm": 0.9627114037689704, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 12500 + }, + { + "epoch": 0.12501, + "grad_norm": 1.0604940883484884, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 12501 + }, + { + "epoch": 0.12502, + "grad_norm": 0.9577059818976567, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 12502 + }, + { + "epoch": 0.12503, + "grad_norm": 0.884475209540014, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 12503 + }, + { + "epoch": 0.12504, + "grad_norm": 0.8173440312329352, + "learning_rate": 0.003, + "loss": 4.089, + "step": 12504 + }, + { + "epoch": 0.12505, + "grad_norm": 0.7489029149484316, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 12505 + }, + { + "epoch": 0.12506, + "grad_norm": 0.7572504979291013, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 12506 + }, + { + "epoch": 0.12507, + "grad_norm": 0.8399998909902504, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 12507 + }, + { + "epoch": 0.12508, + "grad_norm": 0.9337566512163873, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 12508 + }, + { + "epoch": 0.12509, + "grad_norm": 0.8861296871193007, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 12509 + }, + { + "epoch": 0.1251, + "grad_norm": 0.8711578061763322, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 12510 + }, + { + "epoch": 0.12511, + "grad_norm": 0.810476642592666, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 12511 + }, + { + "epoch": 0.12512, + "grad_norm": 0.7685750219209562, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 12512 + }, + { + "epoch": 0.12513, + "grad_norm": 0.8276538036476391, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 12513 + }, + { + "epoch": 0.12514, + "grad_norm": 0.8908374164057034, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 12514 + }, + { + "epoch": 0.12515, + "grad_norm": 1.0910727360858632, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 12515 + }, + { + "epoch": 0.12516, + "grad_norm": 1.0212161466319103, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 12516 + }, + { + "epoch": 0.12517, + "grad_norm": 1.066691309468899, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 12517 + }, + { + "epoch": 0.12518, + "grad_norm": 0.9988527766381885, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 12518 + }, + { + "epoch": 0.12519, + "grad_norm": 1.0140692171238843, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 12519 + }, + { + "epoch": 0.1252, + "grad_norm": 0.9828000839884308, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 12520 + }, + { + "epoch": 0.12521, + "grad_norm": 0.9327551598434594, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 12521 + }, + { + "epoch": 0.12522, + "grad_norm": 0.807818758831363, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 12522 + }, + { + "epoch": 0.12523, + "grad_norm": 0.7805048294306602, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 12523 + }, + { + "epoch": 0.12524, + "grad_norm": 0.6780475265822701, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 12524 + }, + { + "epoch": 0.12525, + "grad_norm": 0.672416846874558, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 12525 + }, + { + "epoch": 0.12526, + "grad_norm": 0.6660977882609734, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 12526 + }, + { + "epoch": 0.12527, + "grad_norm": 0.6541819706521673, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 12527 + }, + { + "epoch": 0.12528, + "grad_norm": 0.67528315323153, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 12528 + }, + { + "epoch": 0.12529, + "grad_norm": 0.6203124045169987, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 12529 + }, + { + "epoch": 0.1253, + "grad_norm": 0.6662733723107425, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 12530 + }, + { + "epoch": 0.12531, + "grad_norm": 0.6766311073551332, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 12531 + }, + { + "epoch": 0.12532, + "grad_norm": 0.7659473563470474, + "learning_rate": 0.003, + "loss": 4.059, + "step": 12532 + }, + { + "epoch": 0.12533, + "grad_norm": 0.8952162523682594, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 12533 + }, + { + "epoch": 0.12534, + "grad_norm": 1.1705063949964587, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 12534 + }, + { + "epoch": 0.12535, + "grad_norm": 0.8209608074328265, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 12535 + }, + { + "epoch": 0.12536, + "grad_norm": 0.6216744131373718, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 12536 + }, + { + "epoch": 0.12537, + "grad_norm": 0.5813816881218197, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 12537 + }, + { + "epoch": 0.12538, + "grad_norm": 0.7213851704102082, + "learning_rate": 0.003, + "loss": 4.071, + "step": 12538 + }, + { + "epoch": 0.12539, + "grad_norm": 0.9239622219843724, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 12539 + }, + { + "epoch": 0.1254, + "grad_norm": 1.0591751024024054, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 12540 + }, + { + "epoch": 0.12541, + "grad_norm": 1.0899608081175878, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 12541 + }, + { + "epoch": 0.12542, + "grad_norm": 0.9360062255258262, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 12542 + }, + { + "epoch": 0.12543, + "grad_norm": 0.981563508663698, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 12543 + }, + { + "epoch": 0.12544, + "grad_norm": 0.9401216606961313, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 12544 + }, + { + "epoch": 0.12545, + "grad_norm": 0.8967062592092603, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 12545 + }, + { + "epoch": 0.12546, + "grad_norm": 0.8775015205434612, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 12546 + }, + { + "epoch": 0.12547, + "grad_norm": 0.9385758344702522, + "learning_rate": 0.003, + "loss": 4.108, + "step": 12547 + }, + { + "epoch": 0.12548, + "grad_norm": 1.0170124611390352, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 12548 + }, + { + "epoch": 0.12549, + "grad_norm": 1.0282448767977495, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 12549 + }, + { + "epoch": 0.1255, + "grad_norm": 0.9355240438214039, + "learning_rate": 0.003, + "loss": 4.077, + "step": 12550 + }, + { + "epoch": 0.12551, + "grad_norm": 0.8492515385424486, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 12551 + }, + { + "epoch": 0.12552, + "grad_norm": 0.8328292067679657, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 12552 + }, + { + "epoch": 0.12553, + "grad_norm": 0.8018285159763675, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 12553 + }, + { + "epoch": 0.12554, + "grad_norm": 0.7826087493872959, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 12554 + }, + { + "epoch": 0.12555, + "grad_norm": 0.753072073280961, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 12555 + }, + { + "epoch": 0.12556, + "grad_norm": 0.6605457000607075, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 12556 + }, + { + "epoch": 0.12557, + "grad_norm": 0.6119107954645638, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 12557 + }, + { + "epoch": 0.12558, + "grad_norm": 0.6351510959937766, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 12558 + }, + { + "epoch": 0.12559, + "grad_norm": 0.6776216370299248, + "learning_rate": 0.003, + "loss": 4.051, + "step": 12559 + }, + { + "epoch": 0.1256, + "grad_norm": 0.7259179291843775, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 12560 + }, + { + "epoch": 0.12561, + "grad_norm": 0.9873701878185189, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 12561 + }, + { + "epoch": 0.12562, + "grad_norm": 1.1549459750928384, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 12562 + }, + { + "epoch": 0.12563, + "grad_norm": 0.7812249219085289, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 12563 + }, + { + "epoch": 0.12564, + "grad_norm": 0.6703423951202678, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 12564 + }, + { + "epoch": 0.12565, + "grad_norm": 0.8357079073240602, + "learning_rate": 0.003, + "loss": 4.087, + "step": 12565 + }, + { + "epoch": 0.12566, + "grad_norm": 0.9552000107530532, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 12566 + }, + { + "epoch": 0.12567, + "grad_norm": 0.9355841374475491, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 12567 + }, + { + "epoch": 0.12568, + "grad_norm": 0.8066419116118289, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 12568 + }, + { + "epoch": 0.12569, + "grad_norm": 0.7235825525019571, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 12569 + }, + { + "epoch": 0.1257, + "grad_norm": 0.7580888885201048, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 12570 + }, + { + "epoch": 0.12571, + "grad_norm": 0.8107129972338145, + "learning_rate": 0.003, + "loss": 4.087, + "step": 12571 + }, + { + "epoch": 0.12572, + "grad_norm": 0.930245214850763, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 12572 + }, + { + "epoch": 0.12573, + "grad_norm": 1.2684625702329895, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 12573 + }, + { + "epoch": 0.12574, + "grad_norm": 0.8440354075216758, + "learning_rate": 0.003, + "loss": 4.089, + "step": 12574 + }, + { + "epoch": 0.12575, + "grad_norm": 0.6315786225056687, + "learning_rate": 0.003, + "loss": 4.112, + "step": 12575 + }, + { + "epoch": 0.12576, + "grad_norm": 0.7041702276896806, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 12576 + }, + { + "epoch": 0.12577, + "grad_norm": 0.7647414725186429, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 12577 + }, + { + "epoch": 0.12578, + "grad_norm": 0.8551047957220735, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 12578 + }, + { + "epoch": 0.12579, + "grad_norm": 0.866404723267827, + "learning_rate": 0.003, + "loss": 4.091, + "step": 12579 + }, + { + "epoch": 0.1258, + "grad_norm": 0.8809057071597594, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 12580 + }, + { + "epoch": 0.12581, + "grad_norm": 0.9915394714466932, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 12581 + }, + { + "epoch": 0.12582, + "grad_norm": 1.1197125321519874, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 12582 + }, + { + "epoch": 0.12583, + "grad_norm": 1.0929628884873095, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 12583 + }, + { + "epoch": 0.12584, + "grad_norm": 0.8525918323487954, + "learning_rate": 0.003, + "loss": 4.062, + "step": 12584 + }, + { + "epoch": 0.12585, + "grad_norm": 0.7374588766012489, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 12585 + }, + { + "epoch": 0.12586, + "grad_norm": 0.7388595932129336, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 12586 + }, + { + "epoch": 0.12587, + "grad_norm": 0.7639845260876715, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 12587 + }, + { + "epoch": 0.12588, + "grad_norm": 0.9487270636440439, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 12588 + }, + { + "epoch": 0.12589, + "grad_norm": 1.0956858061928865, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 12589 + }, + { + "epoch": 0.1259, + "grad_norm": 0.9630797615431528, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 12590 + }, + { + "epoch": 0.12591, + "grad_norm": 0.9493875531469873, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 12591 + }, + { + "epoch": 0.12592, + "grad_norm": 0.8389151739871537, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 12592 + }, + { + "epoch": 0.12593, + "grad_norm": 0.7908775788009506, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 12593 + }, + { + "epoch": 0.12594, + "grad_norm": 0.7313439476606562, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 12594 + }, + { + "epoch": 0.12595, + "grad_norm": 0.8468426952635264, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 12595 + }, + { + "epoch": 0.12596, + "grad_norm": 0.821865089793791, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 12596 + }, + { + "epoch": 0.12597, + "grad_norm": 0.9392946494153627, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 12597 + }, + { + "epoch": 0.12598, + "grad_norm": 0.9931903669329258, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 12598 + }, + { + "epoch": 0.12599, + "grad_norm": 0.9262748105070246, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 12599 + }, + { + "epoch": 0.126, + "grad_norm": 0.7207751396428811, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 12600 + }, + { + "epoch": 0.12601, + "grad_norm": 0.6901411251358255, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 12601 + }, + { + "epoch": 0.12602, + "grad_norm": 0.6198097139667673, + "learning_rate": 0.003, + "loss": 4.091, + "step": 12602 + }, + { + "epoch": 0.12603, + "grad_norm": 0.603433732393143, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 12603 + }, + { + "epoch": 0.12604, + "grad_norm": 0.7273903809849699, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 12604 + }, + { + "epoch": 0.12605, + "grad_norm": 0.827027777018654, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 12605 + }, + { + "epoch": 0.12606, + "grad_norm": 0.8312039632390354, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 12606 + }, + { + "epoch": 0.12607, + "grad_norm": 0.8300227267934658, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 12607 + }, + { + "epoch": 0.12608, + "grad_norm": 0.8334629282650833, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 12608 + }, + { + "epoch": 0.12609, + "grad_norm": 0.8244190523319734, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 12609 + }, + { + "epoch": 0.1261, + "grad_norm": 0.7931709246643678, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 12610 + }, + { + "epoch": 0.12611, + "grad_norm": 0.8401160267227625, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 12611 + }, + { + "epoch": 0.12612, + "grad_norm": 0.9085808627649283, + "learning_rate": 0.003, + "loss": 4.069, + "step": 12612 + }, + { + "epoch": 0.12613, + "grad_norm": 0.9173499827636392, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 12613 + }, + { + "epoch": 0.12614, + "grad_norm": 1.0897131711310648, + "learning_rate": 0.003, + "loss": 4.096, + "step": 12614 + }, + { + "epoch": 0.12615, + "grad_norm": 0.9364496572065395, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 12615 + }, + { + "epoch": 0.12616, + "grad_norm": 0.9667681960767411, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 12616 + }, + { + "epoch": 0.12617, + "grad_norm": 1.0303928702195748, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 12617 + }, + { + "epoch": 0.12618, + "grad_norm": 0.9436246602241444, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 12618 + }, + { + "epoch": 0.12619, + "grad_norm": 0.8890093706854717, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 12619 + }, + { + "epoch": 0.1262, + "grad_norm": 0.9333633955100752, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 12620 + }, + { + "epoch": 0.12621, + "grad_norm": 0.9322594488051535, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 12621 + }, + { + "epoch": 0.12622, + "grad_norm": 0.9033426657643931, + "learning_rate": 0.003, + "loss": 4.085, + "step": 12622 + }, + { + "epoch": 0.12623, + "grad_norm": 0.8078614711868852, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 12623 + }, + { + "epoch": 0.12624, + "grad_norm": 0.7553219433703763, + "learning_rate": 0.003, + "loss": 4.07, + "step": 12624 + }, + { + "epoch": 0.12625, + "grad_norm": 0.6536387376296591, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 12625 + }, + { + "epoch": 0.12626, + "grad_norm": 0.6509050382838308, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 12626 + }, + { + "epoch": 0.12627, + "grad_norm": 0.5894424757023002, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 12627 + }, + { + "epoch": 0.12628, + "grad_norm": 0.5190451966239275, + "learning_rate": 0.003, + "loss": 4.046, + "step": 12628 + }, + { + "epoch": 0.12629, + "grad_norm": 0.5063596654288052, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 12629 + }, + { + "epoch": 0.1263, + "grad_norm": 0.5915460424015676, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 12630 + }, + { + "epoch": 0.12631, + "grad_norm": 0.7716495330152952, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 12631 + }, + { + "epoch": 0.12632, + "grad_norm": 1.0127677704631437, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 12632 + }, + { + "epoch": 0.12633, + "grad_norm": 1.155960196618749, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 12633 + }, + { + "epoch": 0.12634, + "grad_norm": 0.6767776447411626, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 12634 + }, + { + "epoch": 0.12635, + "grad_norm": 0.6486610587694518, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 12635 + }, + { + "epoch": 0.12636, + "grad_norm": 0.9413168441927956, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 12636 + }, + { + "epoch": 0.12637, + "grad_norm": 1.1497099524548084, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 12637 + }, + { + "epoch": 0.12638, + "grad_norm": 0.841578834111071, + "learning_rate": 0.003, + "loss": 4.081, + "step": 12638 + }, + { + "epoch": 0.12639, + "grad_norm": 0.8511860535824317, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 12639 + }, + { + "epoch": 0.1264, + "grad_norm": 0.9587860814417453, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 12640 + }, + { + "epoch": 0.12641, + "grad_norm": 1.0399472699458243, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 12641 + }, + { + "epoch": 0.12642, + "grad_norm": 0.904514659473879, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 12642 + }, + { + "epoch": 0.12643, + "grad_norm": 0.8968843408912103, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 12643 + }, + { + "epoch": 0.12644, + "grad_norm": 0.9694460812122874, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 12644 + }, + { + "epoch": 0.12645, + "grad_norm": 1.1479583275761367, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 12645 + }, + { + "epoch": 0.12646, + "grad_norm": 0.9028210380164542, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 12646 + }, + { + "epoch": 0.12647, + "grad_norm": 0.8392746175735948, + "learning_rate": 0.003, + "loss": 4.068, + "step": 12647 + }, + { + "epoch": 0.12648, + "grad_norm": 0.9050343448598624, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 12648 + }, + { + "epoch": 0.12649, + "grad_norm": 1.0979731835477473, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 12649 + }, + { + "epoch": 0.1265, + "grad_norm": 0.9970749819263638, + "learning_rate": 0.003, + "loss": 4.1179, + "step": 12650 + }, + { + "epoch": 0.12651, + "grad_norm": 1.0300097240371, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 12651 + }, + { + "epoch": 0.12652, + "grad_norm": 0.8094128941912325, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 12652 + }, + { + "epoch": 0.12653, + "grad_norm": 0.8399627373921515, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 12653 + }, + { + "epoch": 0.12654, + "grad_norm": 0.9677214469576224, + "learning_rate": 0.003, + "loss": 4.079, + "step": 12654 + }, + { + "epoch": 0.12655, + "grad_norm": 0.8876434643959279, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 12655 + }, + { + "epoch": 0.12656, + "grad_norm": 0.7688755791017492, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 12656 + }, + { + "epoch": 0.12657, + "grad_norm": 0.7627679301180477, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 12657 + }, + { + "epoch": 0.12658, + "grad_norm": 0.7358996935502059, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 12658 + }, + { + "epoch": 0.12659, + "grad_norm": 0.777638381282783, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 12659 + }, + { + "epoch": 0.1266, + "grad_norm": 0.7263923280451818, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 12660 + }, + { + "epoch": 0.12661, + "grad_norm": 0.7179583472088668, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 12661 + }, + { + "epoch": 0.12662, + "grad_norm": 0.6501556131988983, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 12662 + }, + { + "epoch": 0.12663, + "grad_norm": 0.6406902271699736, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 12663 + }, + { + "epoch": 0.12664, + "grad_norm": 0.6080111377564112, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 12664 + }, + { + "epoch": 0.12665, + "grad_norm": 0.6036312883406694, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 12665 + }, + { + "epoch": 0.12666, + "grad_norm": 0.5768466033785397, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 12666 + }, + { + "epoch": 0.12667, + "grad_norm": 0.6576162442410906, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 12667 + }, + { + "epoch": 0.12668, + "grad_norm": 0.8097843106878837, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 12668 + }, + { + "epoch": 0.12669, + "grad_norm": 0.9462883400550094, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 12669 + }, + { + "epoch": 0.1267, + "grad_norm": 0.9608897301195578, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 12670 + }, + { + "epoch": 0.12671, + "grad_norm": 1.0340855127401694, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 12671 + }, + { + "epoch": 0.12672, + "grad_norm": 1.1531000426119538, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 12672 + }, + { + "epoch": 0.12673, + "grad_norm": 0.8261766238933164, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 12673 + }, + { + "epoch": 0.12674, + "grad_norm": 0.6920260802570728, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 12674 + }, + { + "epoch": 0.12675, + "grad_norm": 0.6335380850494294, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 12675 + }, + { + "epoch": 0.12676, + "grad_norm": 0.7209542624800619, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 12676 + }, + { + "epoch": 0.12677, + "grad_norm": 0.87268806862565, + "learning_rate": 0.003, + "loss": 4.049, + "step": 12677 + }, + { + "epoch": 0.12678, + "grad_norm": 0.8840380040948691, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 12678 + }, + { + "epoch": 0.12679, + "grad_norm": 0.9747808622851675, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 12679 + }, + { + "epoch": 0.1268, + "grad_norm": 1.063767639110145, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 12680 + }, + { + "epoch": 0.12681, + "grad_norm": 0.9301110174392209, + "learning_rate": 0.003, + "loss": 4.091, + "step": 12681 + }, + { + "epoch": 0.12682, + "grad_norm": 0.9316617602133014, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 12682 + }, + { + "epoch": 0.12683, + "grad_norm": 0.8621769611167813, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 12683 + }, + { + "epoch": 0.12684, + "grad_norm": 0.8619352180499619, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 12684 + }, + { + "epoch": 0.12685, + "grad_norm": 0.7818292664604228, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 12685 + }, + { + "epoch": 0.12686, + "grad_norm": 0.831157545940667, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 12686 + }, + { + "epoch": 0.12687, + "grad_norm": 0.9587347990085802, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 12687 + }, + { + "epoch": 0.12688, + "grad_norm": 1.0579182796949145, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 12688 + }, + { + "epoch": 0.12689, + "grad_norm": 0.9786164686866942, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 12689 + }, + { + "epoch": 0.1269, + "grad_norm": 0.863318584753949, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 12690 + }, + { + "epoch": 0.12691, + "grad_norm": 0.8304504158648132, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 12691 + }, + { + "epoch": 0.12692, + "grad_norm": 0.7904162445080669, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 12692 + }, + { + "epoch": 0.12693, + "grad_norm": 0.7862023906149539, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 12693 + }, + { + "epoch": 0.12694, + "grad_norm": 0.8898702567841869, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 12694 + }, + { + "epoch": 0.12695, + "grad_norm": 0.9731601117378544, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 12695 + }, + { + "epoch": 0.12696, + "grad_norm": 0.9838657840018509, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 12696 + }, + { + "epoch": 0.12697, + "grad_norm": 0.8410138578490889, + "learning_rate": 0.003, + "loss": 4.044, + "step": 12697 + }, + { + "epoch": 0.12698, + "grad_norm": 0.7824779930609662, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 12698 + }, + { + "epoch": 0.12699, + "grad_norm": 0.7325758562041346, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 12699 + }, + { + "epoch": 0.127, + "grad_norm": 0.7476038265054501, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 12700 + }, + { + "epoch": 0.12701, + "grad_norm": 0.6474234218140162, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 12701 + }, + { + "epoch": 0.12702, + "grad_norm": 0.5994708210940758, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 12702 + }, + { + "epoch": 0.12703, + "grad_norm": 0.5947004248947164, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 12703 + }, + { + "epoch": 0.12704, + "grad_norm": 0.5822643700856132, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 12704 + }, + { + "epoch": 0.12705, + "grad_norm": 0.6375257904131024, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 12705 + }, + { + "epoch": 0.12706, + "grad_norm": 0.8232573084375202, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 12706 + }, + { + "epoch": 0.12707, + "grad_norm": 1.0490864377683808, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 12707 + }, + { + "epoch": 0.12708, + "grad_norm": 0.9646458821564334, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 12708 + }, + { + "epoch": 0.12709, + "grad_norm": 0.9237631362442041, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 12709 + }, + { + "epoch": 0.1271, + "grad_norm": 1.1018179181554677, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 12710 + }, + { + "epoch": 0.12711, + "grad_norm": 1.0497471397953404, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 12711 + }, + { + "epoch": 0.12712, + "grad_norm": 0.9068318922074851, + "learning_rate": 0.003, + "loss": 4.066, + "step": 12712 + }, + { + "epoch": 0.12713, + "grad_norm": 0.8587292207217677, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 12713 + }, + { + "epoch": 0.12714, + "grad_norm": 0.7722901403512779, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 12714 + }, + { + "epoch": 0.12715, + "grad_norm": 0.6293681040156622, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 12715 + }, + { + "epoch": 0.12716, + "grad_norm": 0.6214351278871658, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 12716 + }, + { + "epoch": 0.12717, + "grad_norm": 0.6163124174025716, + "learning_rate": 0.003, + "loss": 4.083, + "step": 12717 + }, + { + "epoch": 0.12718, + "grad_norm": 0.7295122022742531, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 12718 + }, + { + "epoch": 0.12719, + "grad_norm": 0.7489060106943374, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 12719 + }, + { + "epoch": 0.1272, + "grad_norm": 0.8774515707694688, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 12720 + }, + { + "epoch": 0.12721, + "grad_norm": 1.107975365365161, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 12721 + }, + { + "epoch": 0.12722, + "grad_norm": 1.0814064376376955, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 12722 + }, + { + "epoch": 0.12723, + "grad_norm": 0.7852302860828565, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 12723 + }, + { + "epoch": 0.12724, + "grad_norm": 0.7165640244151582, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 12724 + }, + { + "epoch": 0.12725, + "grad_norm": 0.6870728656197633, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 12725 + }, + { + "epoch": 0.12726, + "grad_norm": 0.6875586946041499, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 12726 + }, + { + "epoch": 0.12727, + "grad_norm": 0.6534996603381649, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 12727 + }, + { + "epoch": 0.12728, + "grad_norm": 0.7220811580946503, + "learning_rate": 0.003, + "loss": 4.078, + "step": 12728 + }, + { + "epoch": 0.12729, + "grad_norm": 0.8525327543524897, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 12729 + }, + { + "epoch": 0.1273, + "grad_norm": 1.0524089976707018, + "learning_rate": 0.003, + "loss": 4.073, + "step": 12730 + }, + { + "epoch": 0.12731, + "grad_norm": 1.253081012855374, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 12731 + }, + { + "epoch": 0.12732, + "grad_norm": 0.7417151562374673, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 12732 + }, + { + "epoch": 0.12733, + "grad_norm": 0.7611871014784876, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 12733 + }, + { + "epoch": 0.12734, + "grad_norm": 0.8215105248523885, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 12734 + }, + { + "epoch": 0.12735, + "grad_norm": 0.8238174222975156, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 12735 + }, + { + "epoch": 0.12736, + "grad_norm": 0.8048380654708065, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 12736 + }, + { + "epoch": 0.12737, + "grad_norm": 0.71805881185348, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 12737 + }, + { + "epoch": 0.12738, + "grad_norm": 0.803432731777592, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 12738 + }, + { + "epoch": 0.12739, + "grad_norm": 0.7892230046494437, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 12739 + }, + { + "epoch": 0.1274, + "grad_norm": 0.8432138331722246, + "learning_rate": 0.003, + "loss": 4.067, + "step": 12740 + }, + { + "epoch": 0.12741, + "grad_norm": 0.8419812098722906, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 12741 + }, + { + "epoch": 0.12742, + "grad_norm": 0.9038260065352892, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 12742 + }, + { + "epoch": 0.12743, + "grad_norm": 0.9751762842945431, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 12743 + }, + { + "epoch": 0.12744, + "grad_norm": 0.9720053831555276, + "learning_rate": 0.003, + "loss": 4.066, + "step": 12744 + }, + { + "epoch": 0.12745, + "grad_norm": 0.9544614546080166, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 12745 + }, + { + "epoch": 0.12746, + "grad_norm": 0.9481144845024555, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 12746 + }, + { + "epoch": 0.12747, + "grad_norm": 0.9534086282439148, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 12747 + }, + { + "epoch": 0.12748, + "grad_norm": 0.7921609079785911, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 12748 + }, + { + "epoch": 0.12749, + "grad_norm": 0.7341232190175854, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 12749 + }, + { + "epoch": 0.1275, + "grad_norm": 0.7934210242478213, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 12750 + }, + { + "epoch": 0.12751, + "grad_norm": 0.8361610882033421, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 12751 + }, + { + "epoch": 0.12752, + "grad_norm": 0.8533153979514542, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 12752 + }, + { + "epoch": 0.12753, + "grad_norm": 0.8429252720795538, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 12753 + }, + { + "epoch": 0.12754, + "grad_norm": 0.8936232536103126, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 12754 + }, + { + "epoch": 0.12755, + "grad_norm": 0.9624691229469716, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 12755 + }, + { + "epoch": 0.12756, + "grad_norm": 1.0371851035955135, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 12756 + }, + { + "epoch": 0.12757, + "grad_norm": 1.045476061843089, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 12757 + }, + { + "epoch": 0.12758, + "grad_norm": 1.07435159385966, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 12758 + }, + { + "epoch": 0.12759, + "grad_norm": 1.081798623831364, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 12759 + }, + { + "epoch": 0.1276, + "grad_norm": 0.9703479152295932, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 12760 + }, + { + "epoch": 0.12761, + "grad_norm": 1.0968494337077377, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 12761 + }, + { + "epoch": 0.12762, + "grad_norm": 0.9243545413233614, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 12762 + }, + { + "epoch": 0.12763, + "grad_norm": 0.9520404423737605, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 12763 + }, + { + "epoch": 0.12764, + "grad_norm": 0.9195906991550372, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 12764 + }, + { + "epoch": 0.12765, + "grad_norm": 0.9882817414585594, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 12765 + }, + { + "epoch": 0.12766, + "grad_norm": 1.0954239881165995, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 12766 + }, + { + "epoch": 0.12767, + "grad_norm": 0.9642797317457092, + "learning_rate": 0.003, + "loss": 4.089, + "step": 12767 + }, + { + "epoch": 0.12768, + "grad_norm": 0.9065007479346235, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 12768 + }, + { + "epoch": 0.12769, + "grad_norm": 0.8194405556326955, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 12769 + }, + { + "epoch": 0.1277, + "grad_norm": 0.6593942198315941, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 12770 + }, + { + "epoch": 0.12771, + "grad_norm": 0.6359885378281872, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 12771 + }, + { + "epoch": 0.12772, + "grad_norm": 0.5624414834893711, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 12772 + }, + { + "epoch": 0.12773, + "grad_norm": 0.5534065103167038, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 12773 + }, + { + "epoch": 0.12774, + "grad_norm": 0.5263871455787446, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 12774 + }, + { + "epoch": 0.12775, + "grad_norm": 0.5001264134923619, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 12775 + }, + { + "epoch": 0.12776, + "grad_norm": 0.4965004417569365, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 12776 + }, + { + "epoch": 0.12777, + "grad_norm": 0.6427303533809592, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 12777 + }, + { + "epoch": 0.12778, + "grad_norm": 0.827424161741527, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 12778 + }, + { + "epoch": 0.12779, + "grad_norm": 0.9720714915404918, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 12779 + }, + { + "epoch": 0.1278, + "grad_norm": 1.2273812453673385, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 12780 + }, + { + "epoch": 0.12781, + "grad_norm": 0.7903980992873841, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 12781 + }, + { + "epoch": 0.12782, + "grad_norm": 0.7933372670231379, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 12782 + }, + { + "epoch": 0.12783, + "grad_norm": 0.9056791732628652, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 12783 + }, + { + "epoch": 0.12784, + "grad_norm": 0.9379469704928732, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 12784 + }, + { + "epoch": 0.12785, + "grad_norm": 0.8312325160204573, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 12785 + }, + { + "epoch": 0.12786, + "grad_norm": 0.811516624713473, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 12786 + }, + { + "epoch": 0.12787, + "grad_norm": 0.8151714149466239, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 12787 + }, + { + "epoch": 0.12788, + "grad_norm": 0.9047057444695532, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 12788 + }, + { + "epoch": 0.12789, + "grad_norm": 1.0056745263984743, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 12789 + }, + { + "epoch": 0.1279, + "grad_norm": 1.007074405841303, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 12790 + }, + { + "epoch": 0.12791, + "grad_norm": 0.9098293337743794, + "learning_rate": 0.003, + "loss": 4.086, + "step": 12791 + }, + { + "epoch": 0.12792, + "grad_norm": 0.8581978897830528, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 12792 + }, + { + "epoch": 0.12793, + "grad_norm": 0.7387993654938598, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 12793 + }, + { + "epoch": 0.12794, + "grad_norm": 0.8717184426974447, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 12794 + }, + { + "epoch": 0.12795, + "grad_norm": 0.9609144342761544, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 12795 + }, + { + "epoch": 0.12796, + "grad_norm": 0.9585935249627756, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 12796 + }, + { + "epoch": 0.12797, + "grad_norm": 0.8826724398226735, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 12797 + }, + { + "epoch": 0.12798, + "grad_norm": 0.7823401183104872, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 12798 + }, + { + "epoch": 0.12799, + "grad_norm": 0.768598197070406, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 12799 + }, + { + "epoch": 0.128, + "grad_norm": 0.7938414804643384, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 12800 + }, + { + "epoch": 0.12801, + "grad_norm": 0.9525671530334016, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 12801 + }, + { + "epoch": 0.12802, + "grad_norm": 1.13163526424453, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 12802 + }, + { + "epoch": 0.12803, + "grad_norm": 1.0032091422069624, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 12803 + }, + { + "epoch": 0.12804, + "grad_norm": 0.9508845934766839, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 12804 + }, + { + "epoch": 0.12805, + "grad_norm": 0.7860804655030508, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 12805 + }, + { + "epoch": 0.12806, + "grad_norm": 0.8292491651003593, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 12806 + }, + { + "epoch": 0.12807, + "grad_norm": 0.7773321265433382, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 12807 + }, + { + "epoch": 0.12808, + "grad_norm": 0.8264676701429482, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 12808 + }, + { + "epoch": 0.12809, + "grad_norm": 0.9778809971197301, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 12809 + }, + { + "epoch": 0.1281, + "grad_norm": 1.051517774945295, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 12810 + }, + { + "epoch": 0.12811, + "grad_norm": 0.8186785474711578, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 12811 + }, + { + "epoch": 0.12812, + "grad_norm": 0.7508226536859004, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 12812 + }, + { + "epoch": 0.12813, + "grad_norm": 0.6934791634966413, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 12813 + }, + { + "epoch": 0.12814, + "grad_norm": 0.7893377048543236, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 12814 + }, + { + "epoch": 0.12815, + "grad_norm": 0.8338115760468434, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 12815 + }, + { + "epoch": 0.12816, + "grad_norm": 0.8479740993884198, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 12816 + }, + { + "epoch": 0.12817, + "grad_norm": 0.9431496997904292, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 12817 + }, + { + "epoch": 0.12818, + "grad_norm": 0.8920237331550439, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 12818 + }, + { + "epoch": 0.12819, + "grad_norm": 0.7156508642588407, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 12819 + }, + { + "epoch": 0.1282, + "grad_norm": 0.6066254421966263, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 12820 + }, + { + "epoch": 0.12821, + "grad_norm": 0.6351940607150885, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 12821 + }, + { + "epoch": 0.12822, + "grad_norm": 0.7034877260393382, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 12822 + }, + { + "epoch": 0.12823, + "grad_norm": 0.6347898413970317, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 12823 + }, + { + "epoch": 0.12824, + "grad_norm": 0.6241083299387901, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 12824 + }, + { + "epoch": 0.12825, + "grad_norm": 0.6756292702600858, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 12825 + }, + { + "epoch": 0.12826, + "grad_norm": 0.8197715713923135, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 12826 + }, + { + "epoch": 0.12827, + "grad_norm": 0.9710675544326205, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 12827 + }, + { + "epoch": 0.12828, + "grad_norm": 1.1480053353882405, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 12828 + }, + { + "epoch": 0.12829, + "grad_norm": 0.7644574256098923, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 12829 + }, + { + "epoch": 0.1283, + "grad_norm": 0.6388582020514061, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 12830 + }, + { + "epoch": 0.12831, + "grad_norm": 0.635757694719699, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 12831 + }, + { + "epoch": 0.12832, + "grad_norm": 0.5750405720226927, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 12832 + }, + { + "epoch": 0.12833, + "grad_norm": 0.6023010902518902, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 12833 + }, + { + "epoch": 0.12834, + "grad_norm": 0.6011194316530417, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 12834 + }, + { + "epoch": 0.12835, + "grad_norm": 0.638985093365683, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 12835 + }, + { + "epoch": 0.12836, + "grad_norm": 0.777352893414973, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 12836 + }, + { + "epoch": 0.12837, + "grad_norm": 0.8026043171894651, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 12837 + }, + { + "epoch": 0.12838, + "grad_norm": 0.7360736772156009, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 12838 + }, + { + "epoch": 0.12839, + "grad_norm": 0.9227694794787703, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 12839 + }, + { + "epoch": 0.1284, + "grad_norm": 1.0628301651771763, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 12840 + }, + { + "epoch": 0.12841, + "grad_norm": 0.9214693039919348, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 12841 + }, + { + "epoch": 0.12842, + "grad_norm": 0.9447539388483064, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 12842 + }, + { + "epoch": 0.12843, + "grad_norm": 0.9482242552067807, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 12843 + }, + { + "epoch": 0.12844, + "grad_norm": 0.8759196059051805, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 12844 + }, + { + "epoch": 0.12845, + "grad_norm": 0.8253334642549143, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 12845 + }, + { + "epoch": 0.12846, + "grad_norm": 0.9249682151513382, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 12846 + }, + { + "epoch": 0.12847, + "grad_norm": 0.965513311820476, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 12847 + }, + { + "epoch": 0.12848, + "grad_norm": 0.9461560906880512, + "learning_rate": 0.003, + "loss": 4.095, + "step": 12848 + }, + { + "epoch": 0.12849, + "grad_norm": 1.0196975758139848, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 12849 + }, + { + "epoch": 0.1285, + "grad_norm": 1.3554759866006791, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 12850 + }, + { + "epoch": 0.12851, + "grad_norm": 0.8908069491735001, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 12851 + }, + { + "epoch": 0.12852, + "grad_norm": 0.8475096520827629, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 12852 + }, + { + "epoch": 0.12853, + "grad_norm": 0.8234717310938925, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 12853 + }, + { + "epoch": 0.12854, + "grad_norm": 0.7358780400837106, + "learning_rate": 0.003, + "loss": 4.101, + "step": 12854 + }, + { + "epoch": 0.12855, + "grad_norm": 0.743786573155452, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 12855 + }, + { + "epoch": 0.12856, + "grad_norm": 0.7386790569167223, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 12856 + }, + { + "epoch": 0.12857, + "grad_norm": 0.7129780781483097, + "learning_rate": 0.003, + "loss": 4.057, + "step": 12857 + }, + { + "epoch": 0.12858, + "grad_norm": 0.7341165024646048, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 12858 + }, + { + "epoch": 0.12859, + "grad_norm": 0.8805065977562269, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 12859 + }, + { + "epoch": 0.1286, + "grad_norm": 1.067801959326891, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 12860 + }, + { + "epoch": 0.12861, + "grad_norm": 1.1242566935397247, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 12861 + }, + { + "epoch": 0.12862, + "grad_norm": 0.9426431915569224, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 12862 + }, + { + "epoch": 0.12863, + "grad_norm": 0.8841812950302461, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 12863 + }, + { + "epoch": 0.12864, + "grad_norm": 0.9420783722109335, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 12864 + }, + { + "epoch": 0.12865, + "grad_norm": 0.9631534265418081, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 12865 + }, + { + "epoch": 0.12866, + "grad_norm": 0.8454715636945396, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 12866 + }, + { + "epoch": 0.12867, + "grad_norm": 0.9503572689770518, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 12867 + }, + { + "epoch": 0.12868, + "grad_norm": 0.9047058078312504, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 12868 + }, + { + "epoch": 0.12869, + "grad_norm": 0.7069135144597288, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 12869 + }, + { + "epoch": 0.1287, + "grad_norm": 0.7834958846660894, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 12870 + }, + { + "epoch": 0.12871, + "grad_norm": 0.8358942142934565, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 12871 + }, + { + "epoch": 0.12872, + "grad_norm": 1.027921339490834, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 12872 + }, + { + "epoch": 0.12873, + "grad_norm": 1.1544453029236919, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 12873 + }, + { + "epoch": 0.12874, + "grad_norm": 0.7813340075521686, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 12874 + }, + { + "epoch": 0.12875, + "grad_norm": 0.6801009675717281, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 12875 + }, + { + "epoch": 0.12876, + "grad_norm": 0.5836689953332413, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 12876 + }, + { + "epoch": 0.12877, + "grad_norm": 0.6791272029631606, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 12877 + }, + { + "epoch": 0.12878, + "grad_norm": 0.7048056118590715, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 12878 + }, + { + "epoch": 0.12879, + "grad_norm": 0.8486441486568741, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 12879 + }, + { + "epoch": 0.1288, + "grad_norm": 1.062333230871585, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 12880 + }, + { + "epoch": 0.12881, + "grad_norm": 1.1208324918485693, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 12881 + }, + { + "epoch": 0.12882, + "grad_norm": 0.7273735247448398, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 12882 + }, + { + "epoch": 0.12883, + "grad_norm": 0.6281737882036764, + "learning_rate": 0.003, + "loss": 4.075, + "step": 12883 + }, + { + "epoch": 0.12884, + "grad_norm": 0.7477414465891695, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 12884 + }, + { + "epoch": 0.12885, + "grad_norm": 0.7890015805595267, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 12885 + }, + { + "epoch": 0.12886, + "grad_norm": 0.7046100214749329, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 12886 + }, + { + "epoch": 0.12887, + "grad_norm": 0.7418100470213351, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 12887 + }, + { + "epoch": 0.12888, + "grad_norm": 0.8826632618108597, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 12888 + }, + { + "epoch": 0.12889, + "grad_norm": 1.0399927608543118, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 12889 + }, + { + "epoch": 0.1289, + "grad_norm": 0.9699092371767827, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 12890 + }, + { + "epoch": 0.12891, + "grad_norm": 0.8861813940068809, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 12891 + }, + { + "epoch": 0.12892, + "grad_norm": 0.933256377383019, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 12892 + }, + { + "epoch": 0.12893, + "grad_norm": 0.8541594423021338, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 12893 + }, + { + "epoch": 0.12894, + "grad_norm": 0.9481820783573804, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 12894 + }, + { + "epoch": 0.12895, + "grad_norm": 0.8663082137737302, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 12895 + }, + { + "epoch": 0.12896, + "grad_norm": 0.8678060328249163, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 12896 + }, + { + "epoch": 0.12897, + "grad_norm": 0.9375534317661987, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 12897 + }, + { + "epoch": 0.12898, + "grad_norm": 1.046933704977625, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 12898 + }, + { + "epoch": 0.12899, + "grad_norm": 0.9438937247867584, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 12899 + }, + { + "epoch": 0.129, + "grad_norm": 0.8095108174709523, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 12900 + }, + { + "epoch": 0.12901, + "grad_norm": 0.8002431242863978, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 12901 + }, + { + "epoch": 0.12902, + "grad_norm": 0.8138692043697183, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 12902 + }, + { + "epoch": 0.12903, + "grad_norm": 0.8796085016606462, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 12903 + }, + { + "epoch": 0.12904, + "grad_norm": 0.9236062888538203, + "learning_rate": 0.003, + "loss": 4.046, + "step": 12904 + }, + { + "epoch": 0.12905, + "grad_norm": 0.8413774387686741, + "learning_rate": 0.003, + "loss": 4.068, + "step": 12905 + }, + { + "epoch": 0.12906, + "grad_norm": 0.9456752557198059, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 12906 + }, + { + "epoch": 0.12907, + "grad_norm": 0.9542059686757122, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 12907 + }, + { + "epoch": 0.12908, + "grad_norm": 0.8969952436447906, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 12908 + }, + { + "epoch": 0.12909, + "grad_norm": 0.8660752120290028, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 12909 + }, + { + "epoch": 0.1291, + "grad_norm": 0.7206792488668173, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 12910 + }, + { + "epoch": 0.12911, + "grad_norm": 0.5894989401822226, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 12911 + }, + { + "epoch": 0.12912, + "grad_norm": 0.6632744636774794, + "learning_rate": 0.003, + "loss": 4.045, + "step": 12912 + }, + { + "epoch": 0.12913, + "grad_norm": 0.7989619794857723, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 12913 + }, + { + "epoch": 0.12914, + "grad_norm": 1.1599328636317552, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 12914 + }, + { + "epoch": 0.12915, + "grad_norm": 1.1144397017046104, + "learning_rate": 0.003, + "loss": 4.055, + "step": 12915 + }, + { + "epoch": 0.12916, + "grad_norm": 0.7532191660574785, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 12916 + }, + { + "epoch": 0.12917, + "grad_norm": 0.5736646240015582, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 12917 + }, + { + "epoch": 0.12918, + "grad_norm": 0.6677992780705447, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 12918 + }, + { + "epoch": 0.12919, + "grad_norm": 0.8075768288501416, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 12919 + }, + { + "epoch": 0.1292, + "grad_norm": 0.9430240478196712, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 12920 + }, + { + "epoch": 0.12921, + "grad_norm": 0.8289275041982015, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 12921 + }, + { + "epoch": 0.12922, + "grad_norm": 0.761314826384689, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 12922 + }, + { + "epoch": 0.12923, + "grad_norm": 0.8013300239881165, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 12923 + }, + { + "epoch": 0.12924, + "grad_norm": 0.7535540996983051, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 12924 + }, + { + "epoch": 0.12925, + "grad_norm": 0.7337997334087438, + "learning_rate": 0.003, + "loss": 4.071, + "step": 12925 + }, + { + "epoch": 0.12926, + "grad_norm": 0.6758098417641483, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 12926 + }, + { + "epoch": 0.12927, + "grad_norm": 0.6646773609405928, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 12927 + }, + { + "epoch": 0.12928, + "grad_norm": 0.7758042765328156, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 12928 + }, + { + "epoch": 0.12929, + "grad_norm": 0.9014493018144143, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 12929 + }, + { + "epoch": 0.1293, + "grad_norm": 0.9069207676053608, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 12930 + }, + { + "epoch": 0.12931, + "grad_norm": 0.8762834510823424, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 12931 + }, + { + "epoch": 0.12932, + "grad_norm": 0.9091437078713948, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 12932 + }, + { + "epoch": 0.12933, + "grad_norm": 0.8633790914666007, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 12933 + }, + { + "epoch": 0.12934, + "grad_norm": 0.8055331303781942, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 12934 + }, + { + "epoch": 0.12935, + "grad_norm": 0.8179019459764888, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 12935 + }, + { + "epoch": 0.12936, + "grad_norm": 0.7862144587775611, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 12936 + }, + { + "epoch": 0.12937, + "grad_norm": 0.8099260534544216, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 12937 + }, + { + "epoch": 0.12938, + "grad_norm": 0.8650704844461392, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 12938 + }, + { + "epoch": 0.12939, + "grad_norm": 0.8769552795770255, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 12939 + }, + { + "epoch": 0.1294, + "grad_norm": 1.1069886038868555, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 12940 + }, + { + "epoch": 0.12941, + "grad_norm": 0.8768949926516877, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 12941 + }, + { + "epoch": 0.12942, + "grad_norm": 0.7580148325910523, + "learning_rate": 0.003, + "loss": 4.06, + "step": 12942 + }, + { + "epoch": 0.12943, + "grad_norm": 0.766777102968796, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 12943 + }, + { + "epoch": 0.12944, + "grad_norm": 0.7366808448795482, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 12944 + }, + { + "epoch": 0.12945, + "grad_norm": 0.680775920463159, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 12945 + }, + { + "epoch": 0.12946, + "grad_norm": 0.8120383621936467, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 12946 + }, + { + "epoch": 0.12947, + "grad_norm": 1.0223654037160204, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 12947 + }, + { + "epoch": 0.12948, + "grad_norm": 1.169831218843661, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 12948 + }, + { + "epoch": 0.12949, + "grad_norm": 0.8200417608046442, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 12949 + }, + { + "epoch": 0.1295, + "grad_norm": 0.7819459739999236, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 12950 + }, + { + "epoch": 0.12951, + "grad_norm": 0.8241898175736854, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 12951 + }, + { + "epoch": 0.12952, + "grad_norm": 0.7630068419542384, + "learning_rate": 0.003, + "loss": 4.062, + "step": 12952 + }, + { + "epoch": 0.12953, + "grad_norm": 0.7052265617346959, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 12953 + }, + { + "epoch": 0.12954, + "grad_norm": 0.6481834232325121, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 12954 + }, + { + "epoch": 0.12955, + "grad_norm": 0.5027166237042523, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 12955 + }, + { + "epoch": 0.12956, + "grad_norm": 0.6150068430572803, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 12956 + }, + { + "epoch": 0.12957, + "grad_norm": 0.7659736446842879, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 12957 + }, + { + "epoch": 0.12958, + "grad_norm": 0.9386359161422575, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 12958 + }, + { + "epoch": 0.12959, + "grad_norm": 1.0537360487182426, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 12959 + }, + { + "epoch": 0.1296, + "grad_norm": 0.861816070614856, + "learning_rate": 0.003, + "loss": 4.065, + "step": 12960 + }, + { + "epoch": 0.12961, + "grad_norm": 0.7717231801413187, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 12961 + }, + { + "epoch": 0.12962, + "grad_norm": 0.6888662908134324, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 12962 + }, + { + "epoch": 0.12963, + "grad_norm": 0.6496168495428045, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 12963 + }, + { + "epoch": 0.12964, + "grad_norm": 0.7498354228466765, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 12964 + }, + { + "epoch": 0.12965, + "grad_norm": 0.8796226972801704, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 12965 + }, + { + "epoch": 0.12966, + "grad_norm": 0.9162752314847318, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 12966 + }, + { + "epoch": 0.12967, + "grad_norm": 0.949790235599875, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 12967 + }, + { + "epoch": 0.12968, + "grad_norm": 0.9381701683390344, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 12968 + }, + { + "epoch": 0.12969, + "grad_norm": 1.0926731768455018, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 12969 + }, + { + "epoch": 0.1297, + "grad_norm": 0.8945847611525154, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 12970 + }, + { + "epoch": 0.12971, + "grad_norm": 0.8190152391461676, + "learning_rate": 0.003, + "loss": 4.068, + "step": 12971 + }, + { + "epoch": 0.12972, + "grad_norm": 0.7558570658616437, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 12972 + }, + { + "epoch": 0.12973, + "grad_norm": 0.8469493343029718, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 12973 + }, + { + "epoch": 0.12974, + "grad_norm": 1.0091381673217301, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 12974 + }, + { + "epoch": 0.12975, + "grad_norm": 1.1879356593057941, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 12975 + }, + { + "epoch": 0.12976, + "grad_norm": 1.0384012763491532, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 12976 + }, + { + "epoch": 0.12977, + "grad_norm": 1.0360118231586506, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 12977 + }, + { + "epoch": 0.12978, + "grad_norm": 1.0249444072596572, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 12978 + }, + { + "epoch": 0.12979, + "grad_norm": 1.091963358540554, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 12979 + }, + { + "epoch": 0.1298, + "grad_norm": 0.889879935839374, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 12980 + }, + { + "epoch": 0.12981, + "grad_norm": 0.860354677563818, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 12981 + }, + { + "epoch": 0.12982, + "grad_norm": 0.9381865988928322, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 12982 + }, + { + "epoch": 0.12983, + "grad_norm": 1.0938494344300176, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 12983 + }, + { + "epoch": 0.12984, + "grad_norm": 0.9402280239919303, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 12984 + }, + { + "epoch": 0.12985, + "grad_norm": 0.8944998677862985, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 12985 + }, + { + "epoch": 0.12986, + "grad_norm": 0.8105911122210153, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 12986 + }, + { + "epoch": 0.12987, + "grad_norm": 0.9400223837798752, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 12987 + }, + { + "epoch": 0.12988, + "grad_norm": 1.1004084188466887, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 12988 + }, + { + "epoch": 0.12989, + "grad_norm": 0.9366609708033388, + "learning_rate": 0.003, + "loss": 4.084, + "step": 12989 + }, + { + "epoch": 0.1299, + "grad_norm": 1.0336862582703732, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 12990 + }, + { + "epoch": 0.12991, + "grad_norm": 0.9608165821353573, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 12991 + }, + { + "epoch": 0.12992, + "grad_norm": 0.9952602100669985, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 12992 + }, + { + "epoch": 0.12993, + "grad_norm": 1.0227492748641744, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 12993 + }, + { + "epoch": 0.12994, + "grad_norm": 0.7880943350209298, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 12994 + }, + { + "epoch": 0.12995, + "grad_norm": 0.790431399092981, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 12995 + }, + { + "epoch": 0.12996, + "grad_norm": 0.9321800086828976, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 12996 + }, + { + "epoch": 0.12997, + "grad_norm": 0.8615839377992883, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 12997 + }, + { + "epoch": 0.12998, + "grad_norm": 0.9067442775911201, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 12998 + }, + { + "epoch": 0.12999, + "grad_norm": 0.8373932866515066, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 12999 + }, + { + "epoch": 0.13, + "grad_norm": 0.8540315642567873, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 13000 + }, + { + "epoch": 0.13001, + "grad_norm": 0.9359698174485682, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 13001 + }, + { + "epoch": 0.13002, + "grad_norm": 0.9592830153337547, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 13002 + }, + { + "epoch": 0.13003, + "grad_norm": 1.046448617999635, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 13003 + }, + { + "epoch": 0.13004, + "grad_norm": 1.1041883637967569, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 13004 + }, + { + "epoch": 0.13005, + "grad_norm": 1.0369515453918619, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 13005 + }, + { + "epoch": 0.13006, + "grad_norm": 0.9979735094558032, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 13006 + }, + { + "epoch": 0.13007, + "grad_norm": 1.0611639538018411, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 13007 + }, + { + "epoch": 0.13008, + "grad_norm": 0.8201878202049163, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 13008 + }, + { + "epoch": 0.13009, + "grad_norm": 0.7118108357889752, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 13009 + }, + { + "epoch": 0.1301, + "grad_norm": 0.6629440697703289, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 13010 + }, + { + "epoch": 0.13011, + "grad_norm": 0.6007149610370014, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 13011 + }, + { + "epoch": 0.13012, + "grad_norm": 0.6921546080483328, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 13012 + }, + { + "epoch": 0.13013, + "grad_norm": 0.7107746587669299, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 13013 + }, + { + "epoch": 0.13014, + "grad_norm": 0.6984294373117751, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 13014 + }, + { + "epoch": 0.13015, + "grad_norm": 0.6789059948176281, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 13015 + }, + { + "epoch": 0.13016, + "grad_norm": 0.6844927291604583, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 13016 + }, + { + "epoch": 0.13017, + "grad_norm": 0.6431965019570215, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 13017 + }, + { + "epoch": 0.13018, + "grad_norm": 0.6011039426413707, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 13018 + }, + { + "epoch": 0.13019, + "grad_norm": 0.5103307281775522, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 13019 + }, + { + "epoch": 0.1302, + "grad_norm": 0.5353798392896162, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 13020 + }, + { + "epoch": 0.13021, + "grad_norm": 0.548977835450416, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 13021 + }, + { + "epoch": 0.13022, + "grad_norm": 0.5161434070433679, + "learning_rate": 0.003, + "loss": 4.078, + "step": 13022 + }, + { + "epoch": 0.13023, + "grad_norm": 0.5269826526927525, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 13023 + }, + { + "epoch": 0.13024, + "grad_norm": 0.6259089187899785, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 13024 + }, + { + "epoch": 0.13025, + "grad_norm": 0.8800244285647391, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 13025 + }, + { + "epoch": 0.13026, + "grad_norm": 1.2491259496586147, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 13026 + }, + { + "epoch": 0.13027, + "grad_norm": 0.9315497999331711, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 13027 + }, + { + "epoch": 0.13028, + "grad_norm": 0.8417572856152467, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 13028 + }, + { + "epoch": 0.13029, + "grad_norm": 0.8742979611520736, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 13029 + }, + { + "epoch": 0.1303, + "grad_norm": 0.7711651552812404, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 13030 + }, + { + "epoch": 0.13031, + "grad_norm": 0.803693913087942, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 13031 + }, + { + "epoch": 0.13032, + "grad_norm": 0.9716408104367728, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 13032 + }, + { + "epoch": 0.13033, + "grad_norm": 1.066738321773958, + "learning_rate": 0.003, + "loss": 4.067, + "step": 13033 + }, + { + "epoch": 0.13034, + "grad_norm": 1.064298193459379, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 13034 + }, + { + "epoch": 0.13035, + "grad_norm": 0.8019257374761538, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 13035 + }, + { + "epoch": 0.13036, + "grad_norm": 0.845083701172066, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 13036 + }, + { + "epoch": 0.13037, + "grad_norm": 0.8738824292069386, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 13037 + }, + { + "epoch": 0.13038, + "grad_norm": 0.861807617539825, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 13038 + }, + { + "epoch": 0.13039, + "grad_norm": 0.984947554147437, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 13039 + }, + { + "epoch": 0.1304, + "grad_norm": 1.0049046163421724, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 13040 + }, + { + "epoch": 0.13041, + "grad_norm": 1.0052885994190504, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 13041 + }, + { + "epoch": 0.13042, + "grad_norm": 0.9684938282207431, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 13042 + }, + { + "epoch": 0.13043, + "grad_norm": 0.9184514064066112, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 13043 + }, + { + "epoch": 0.13044, + "grad_norm": 0.8827308505679203, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 13044 + }, + { + "epoch": 0.13045, + "grad_norm": 0.7717593571762313, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 13045 + }, + { + "epoch": 0.13046, + "grad_norm": 0.8679082897595768, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 13046 + }, + { + "epoch": 0.13047, + "grad_norm": 0.8309256501054642, + "learning_rate": 0.003, + "loss": 4.085, + "step": 13047 + }, + { + "epoch": 0.13048, + "grad_norm": 0.8377738423423138, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 13048 + }, + { + "epoch": 0.13049, + "grad_norm": 0.7119157355427475, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 13049 + }, + { + "epoch": 0.1305, + "grad_norm": 0.6521149593515434, + "learning_rate": 0.003, + "loss": 4.045, + "step": 13050 + }, + { + "epoch": 0.13051, + "grad_norm": 0.6187860253881744, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 13051 + }, + { + "epoch": 0.13052, + "grad_norm": 0.721820839150363, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 13052 + }, + { + "epoch": 0.13053, + "grad_norm": 0.881763567897137, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 13053 + }, + { + "epoch": 0.13054, + "grad_norm": 0.9884496746256645, + "learning_rate": 0.003, + "loss": 4.093, + "step": 13054 + }, + { + "epoch": 0.13055, + "grad_norm": 1.145426405822364, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 13055 + }, + { + "epoch": 0.13056, + "grad_norm": 0.9911722617006568, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 13056 + }, + { + "epoch": 0.13057, + "grad_norm": 1.0815519581620563, + "learning_rate": 0.003, + "loss": 4.096, + "step": 13057 + }, + { + "epoch": 0.13058, + "grad_norm": 1.1161554753926237, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 13058 + }, + { + "epoch": 0.13059, + "grad_norm": 1.0353923070365907, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 13059 + }, + { + "epoch": 0.1306, + "grad_norm": 1.0491545620726093, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 13060 + }, + { + "epoch": 0.13061, + "grad_norm": 0.9176883461055413, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 13061 + }, + { + "epoch": 0.13062, + "grad_norm": 0.9654262517848932, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 13062 + }, + { + "epoch": 0.13063, + "grad_norm": 0.9733430114778608, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 13063 + }, + { + "epoch": 0.13064, + "grad_norm": 0.9159542932584501, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 13064 + }, + { + "epoch": 0.13065, + "grad_norm": 0.9137437128996506, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 13065 + }, + { + "epoch": 0.13066, + "grad_norm": 0.9965392933215164, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 13066 + }, + { + "epoch": 0.13067, + "grad_norm": 1.0799242888190321, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 13067 + }, + { + "epoch": 0.13068, + "grad_norm": 0.8908478738348087, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 13068 + }, + { + "epoch": 0.13069, + "grad_norm": 0.8505816538765397, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 13069 + }, + { + "epoch": 0.1307, + "grad_norm": 0.7138405962694484, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 13070 + }, + { + "epoch": 0.13071, + "grad_norm": 0.6344437386784378, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 13071 + }, + { + "epoch": 0.13072, + "grad_norm": 0.5760631033472555, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 13072 + }, + { + "epoch": 0.13073, + "grad_norm": 0.5379709264467936, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 13073 + }, + { + "epoch": 0.13074, + "grad_norm": 0.578691080934062, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 13074 + }, + { + "epoch": 0.13075, + "grad_norm": 0.609909360116078, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 13075 + }, + { + "epoch": 0.13076, + "grad_norm": 0.6066412195555408, + "learning_rate": 0.003, + "loss": 4.079, + "step": 13076 + }, + { + "epoch": 0.13077, + "grad_norm": 0.6342014969921537, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 13077 + }, + { + "epoch": 0.13078, + "grad_norm": 0.7264191472838323, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 13078 + }, + { + "epoch": 0.13079, + "grad_norm": 0.6417007823997498, + "learning_rate": 0.003, + "loss": 4.038, + "step": 13079 + }, + { + "epoch": 0.1308, + "grad_norm": 0.6143304071375301, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 13080 + }, + { + "epoch": 0.13081, + "grad_norm": 0.6149109507383749, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 13081 + }, + { + "epoch": 0.13082, + "grad_norm": 0.5095949171740436, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 13082 + }, + { + "epoch": 0.13083, + "grad_norm": 0.62434362937003, + "learning_rate": 0.003, + "loss": 4.054, + "step": 13083 + }, + { + "epoch": 0.13084, + "grad_norm": 0.8406162652846959, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 13084 + }, + { + "epoch": 0.13085, + "grad_norm": 1.2752997785470583, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 13085 + }, + { + "epoch": 0.13086, + "grad_norm": 0.8614518385858537, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 13086 + }, + { + "epoch": 0.13087, + "grad_norm": 0.6625540584171613, + "learning_rate": 0.003, + "loss": 4.051, + "step": 13087 + }, + { + "epoch": 0.13088, + "grad_norm": 0.6474896194729026, + "learning_rate": 0.003, + "loss": 4.072, + "step": 13088 + }, + { + "epoch": 0.13089, + "grad_norm": 0.7318139577371409, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 13089 + }, + { + "epoch": 0.1309, + "grad_norm": 0.8177182020866904, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 13090 + }, + { + "epoch": 0.13091, + "grad_norm": 0.7604785102669028, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 13091 + }, + { + "epoch": 0.13092, + "grad_norm": 0.6706188706026742, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 13092 + }, + { + "epoch": 0.13093, + "grad_norm": 0.7083463370726262, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 13093 + }, + { + "epoch": 0.13094, + "grad_norm": 0.8246775184543369, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 13094 + }, + { + "epoch": 0.13095, + "grad_norm": 0.8959539616378489, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 13095 + }, + { + "epoch": 0.13096, + "grad_norm": 1.1426664720440958, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 13096 + }, + { + "epoch": 0.13097, + "grad_norm": 0.9314302334293945, + "learning_rate": 0.003, + "loss": 4.092, + "step": 13097 + }, + { + "epoch": 0.13098, + "grad_norm": 0.8097032024710651, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 13098 + }, + { + "epoch": 0.13099, + "grad_norm": 0.9162781554443832, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 13099 + }, + { + "epoch": 0.131, + "grad_norm": 0.9343947726975026, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 13100 + }, + { + "epoch": 0.13101, + "grad_norm": 0.7942321020277784, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 13101 + }, + { + "epoch": 0.13102, + "grad_norm": 0.9429946702699512, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 13102 + }, + { + "epoch": 0.13103, + "grad_norm": 0.943910967721569, + "learning_rate": 0.003, + "loss": 4.095, + "step": 13103 + }, + { + "epoch": 0.13104, + "grad_norm": 0.99596788147382, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 13104 + }, + { + "epoch": 0.13105, + "grad_norm": 1.122752514585132, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 13105 + }, + { + "epoch": 0.13106, + "grad_norm": 0.9153266233651645, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 13106 + }, + { + "epoch": 0.13107, + "grad_norm": 0.8791341139465975, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 13107 + }, + { + "epoch": 0.13108, + "grad_norm": 0.8672799144422796, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 13108 + }, + { + "epoch": 0.13109, + "grad_norm": 0.7203451203721, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 13109 + }, + { + "epoch": 0.1311, + "grad_norm": 0.6602568170161984, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 13110 + }, + { + "epoch": 0.13111, + "grad_norm": 0.7527357657881135, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 13111 + }, + { + "epoch": 0.13112, + "grad_norm": 1.1102494497800772, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 13112 + }, + { + "epoch": 0.13113, + "grad_norm": 1.0245672109883843, + "learning_rate": 0.003, + "loss": 4.1131, + "step": 13113 + }, + { + "epoch": 0.13114, + "grad_norm": 1.0114187315722039, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 13114 + }, + { + "epoch": 0.13115, + "grad_norm": 1.1803051995778815, + "learning_rate": 0.003, + "loss": 4.1, + "step": 13115 + }, + { + "epoch": 0.13116, + "grad_norm": 0.7990502832944206, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 13116 + }, + { + "epoch": 0.13117, + "grad_norm": 0.7845891788833669, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 13117 + }, + { + "epoch": 0.13118, + "grad_norm": 0.895614630018304, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 13118 + }, + { + "epoch": 0.13119, + "grad_norm": 0.9073412439270881, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 13119 + }, + { + "epoch": 0.1312, + "grad_norm": 0.8443048639033641, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 13120 + }, + { + "epoch": 0.13121, + "grad_norm": 0.841496267158033, + "learning_rate": 0.003, + "loss": 4.073, + "step": 13121 + }, + { + "epoch": 0.13122, + "grad_norm": 0.9764370092772551, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 13122 + }, + { + "epoch": 0.13123, + "grad_norm": 0.9963997793009255, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 13123 + }, + { + "epoch": 0.13124, + "grad_norm": 0.9514972446333433, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 13124 + }, + { + "epoch": 0.13125, + "grad_norm": 0.8162910350678603, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 13125 + }, + { + "epoch": 0.13126, + "grad_norm": 0.8469684407231409, + "learning_rate": 0.003, + "loss": 4.087, + "step": 13126 + }, + { + "epoch": 0.13127, + "grad_norm": 0.9282328198645152, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 13127 + }, + { + "epoch": 0.13128, + "grad_norm": 1.0681435375555972, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 13128 + }, + { + "epoch": 0.13129, + "grad_norm": 1.0618651926043272, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 13129 + }, + { + "epoch": 0.1313, + "grad_norm": 0.9039759339251511, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 13130 + }, + { + "epoch": 0.13131, + "grad_norm": 0.8369774349939699, + "learning_rate": 0.003, + "loss": 4.09, + "step": 13131 + }, + { + "epoch": 0.13132, + "grad_norm": 0.745261133516445, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 13132 + }, + { + "epoch": 0.13133, + "grad_norm": 0.75806078975999, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 13133 + }, + { + "epoch": 0.13134, + "grad_norm": 0.8801813665835895, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 13134 + }, + { + "epoch": 0.13135, + "grad_norm": 0.969489240654843, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 13135 + }, + { + "epoch": 0.13136, + "grad_norm": 1.1686724945289257, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 13136 + }, + { + "epoch": 0.13137, + "grad_norm": 0.9697056041396325, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 13137 + }, + { + "epoch": 0.13138, + "grad_norm": 1.0511729522703308, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 13138 + }, + { + "epoch": 0.13139, + "grad_norm": 0.9425962853681444, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 13139 + }, + { + "epoch": 0.1314, + "grad_norm": 0.856047400876727, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 13140 + }, + { + "epoch": 0.13141, + "grad_norm": 0.86737845061535, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 13141 + }, + { + "epoch": 0.13142, + "grad_norm": 0.7888503334575826, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 13142 + }, + { + "epoch": 0.13143, + "grad_norm": 0.7754088984735394, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 13143 + }, + { + "epoch": 0.13144, + "grad_norm": 0.6275481544428124, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 13144 + }, + { + "epoch": 0.13145, + "grad_norm": 0.5847896020574547, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 13145 + }, + { + "epoch": 0.13146, + "grad_norm": 0.5082445841466955, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 13146 + }, + { + "epoch": 0.13147, + "grad_norm": 0.5455089289145286, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 13147 + }, + { + "epoch": 0.13148, + "grad_norm": 0.5665523262959993, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 13148 + }, + { + "epoch": 0.13149, + "grad_norm": 0.7012564519777836, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 13149 + }, + { + "epoch": 0.1315, + "grad_norm": 0.9812983457873712, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 13150 + }, + { + "epoch": 0.13151, + "grad_norm": 1.3901380597883277, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 13151 + }, + { + "epoch": 0.13152, + "grad_norm": 0.49944231895829955, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 13152 + }, + { + "epoch": 0.13153, + "grad_norm": 0.8671324059203369, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 13153 + }, + { + "epoch": 0.13154, + "grad_norm": 1.1300493237968723, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 13154 + }, + { + "epoch": 0.13155, + "grad_norm": 0.7987944954842804, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 13155 + }, + { + "epoch": 0.13156, + "grad_norm": 0.6761337520479488, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 13156 + }, + { + "epoch": 0.13157, + "grad_norm": 0.6596813875344903, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 13157 + }, + { + "epoch": 0.13158, + "grad_norm": 0.5263151854628152, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 13158 + }, + { + "epoch": 0.13159, + "grad_norm": 0.5402213954657354, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 13159 + }, + { + "epoch": 0.1316, + "grad_norm": 0.6789202818058099, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 13160 + }, + { + "epoch": 0.13161, + "grad_norm": 0.6385281448634009, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 13161 + }, + { + "epoch": 0.13162, + "grad_norm": 0.601959584583655, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 13162 + }, + { + "epoch": 0.13163, + "grad_norm": 0.6117058188699801, + "learning_rate": 0.003, + "loss": 4.104, + "step": 13163 + }, + { + "epoch": 0.13164, + "grad_norm": 0.725438513292309, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 13164 + }, + { + "epoch": 0.13165, + "grad_norm": 0.867140457954247, + "learning_rate": 0.003, + "loss": 4.077, + "step": 13165 + }, + { + "epoch": 0.13166, + "grad_norm": 1.006111367922786, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 13166 + }, + { + "epoch": 0.13167, + "grad_norm": 1.187665915597781, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 13167 + }, + { + "epoch": 0.13168, + "grad_norm": 0.9338917443860894, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 13168 + }, + { + "epoch": 0.13169, + "grad_norm": 0.9536661443703157, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 13169 + }, + { + "epoch": 0.1317, + "grad_norm": 0.9511285463417876, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 13170 + }, + { + "epoch": 0.13171, + "grad_norm": 0.9164751396731089, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 13171 + }, + { + "epoch": 0.13172, + "grad_norm": 0.9934512041007826, + "learning_rate": 0.003, + "loss": 4.071, + "step": 13172 + }, + { + "epoch": 0.13173, + "grad_norm": 1.053517198022562, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 13173 + }, + { + "epoch": 0.13174, + "grad_norm": 1.0092202746747236, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 13174 + }, + { + "epoch": 0.13175, + "grad_norm": 1.2267742892670814, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 13175 + }, + { + "epoch": 0.13176, + "grad_norm": 0.9811606246202388, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 13176 + }, + { + "epoch": 0.13177, + "grad_norm": 0.8342770824928509, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 13177 + }, + { + "epoch": 0.13178, + "grad_norm": 0.8419320414717685, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 13178 + }, + { + "epoch": 0.13179, + "grad_norm": 0.8048464154508961, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 13179 + }, + { + "epoch": 0.1318, + "grad_norm": 0.757893781766694, + "learning_rate": 0.003, + "loss": 4.067, + "step": 13180 + }, + { + "epoch": 0.13181, + "grad_norm": 0.6878957941523194, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 13181 + }, + { + "epoch": 0.13182, + "grad_norm": 0.668821590616849, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 13182 + }, + { + "epoch": 0.13183, + "grad_norm": 0.9340885385093695, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 13183 + }, + { + "epoch": 0.13184, + "grad_norm": 1.1274754922823589, + "learning_rate": 0.003, + "loss": 4.085, + "step": 13184 + }, + { + "epoch": 0.13185, + "grad_norm": 0.9331556957925303, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 13185 + }, + { + "epoch": 0.13186, + "grad_norm": 0.8216673544501654, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 13186 + }, + { + "epoch": 0.13187, + "grad_norm": 0.7140573227719084, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 13187 + }, + { + "epoch": 0.13188, + "grad_norm": 0.6069945344851568, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 13188 + }, + { + "epoch": 0.13189, + "grad_norm": 0.6018208352122972, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 13189 + }, + { + "epoch": 0.1319, + "grad_norm": 0.563879967962961, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 13190 + }, + { + "epoch": 0.13191, + "grad_norm": 0.6599430768938669, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 13191 + }, + { + "epoch": 0.13192, + "grad_norm": 0.8184650611546166, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 13192 + }, + { + "epoch": 0.13193, + "grad_norm": 0.881523759877944, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 13193 + }, + { + "epoch": 0.13194, + "grad_norm": 0.9461369100877264, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 13194 + }, + { + "epoch": 0.13195, + "grad_norm": 1.153724315123479, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 13195 + }, + { + "epoch": 0.13196, + "grad_norm": 1.033498270848711, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 13196 + }, + { + "epoch": 0.13197, + "grad_norm": 0.9548637052073567, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 13197 + }, + { + "epoch": 0.13198, + "grad_norm": 0.8518694067242146, + "learning_rate": 0.003, + "loss": 4.075, + "step": 13198 + }, + { + "epoch": 0.13199, + "grad_norm": 0.9343486281432397, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 13199 + }, + { + "epoch": 0.132, + "grad_norm": 0.9524390913324494, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 13200 + }, + { + "epoch": 0.13201, + "grad_norm": 0.913308437512326, + "learning_rate": 0.003, + "loss": 4.057, + "step": 13201 + }, + { + "epoch": 0.13202, + "grad_norm": 0.8916982587930954, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 13202 + }, + { + "epoch": 0.13203, + "grad_norm": 0.8518831238121933, + "learning_rate": 0.003, + "loss": 4.089, + "step": 13203 + }, + { + "epoch": 0.13204, + "grad_norm": 0.87098341177306, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 13204 + }, + { + "epoch": 0.13205, + "grad_norm": 0.7907988168737735, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 13205 + }, + { + "epoch": 0.13206, + "grad_norm": 0.6765463916777932, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 13206 + }, + { + "epoch": 0.13207, + "grad_norm": 0.6624752649612393, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 13207 + }, + { + "epoch": 0.13208, + "grad_norm": 0.6076925646094622, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 13208 + }, + { + "epoch": 0.13209, + "grad_norm": 0.7220498401414271, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 13209 + }, + { + "epoch": 0.1321, + "grad_norm": 0.8643535197737705, + "learning_rate": 0.003, + "loss": 4.115, + "step": 13210 + }, + { + "epoch": 0.13211, + "grad_norm": 1.1158427207401402, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 13211 + }, + { + "epoch": 0.13212, + "grad_norm": 1.211885219651439, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 13212 + }, + { + "epoch": 0.13213, + "grad_norm": 0.9472241994569061, + "learning_rate": 0.003, + "loss": 4.099, + "step": 13213 + }, + { + "epoch": 0.13214, + "grad_norm": 0.9278604297079831, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 13214 + }, + { + "epoch": 0.13215, + "grad_norm": 1.051499165609013, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 13215 + }, + { + "epoch": 0.13216, + "grad_norm": 0.9914093115313234, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 13216 + }, + { + "epoch": 0.13217, + "grad_norm": 1.081604203808225, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 13217 + }, + { + "epoch": 0.13218, + "grad_norm": 0.9413066234986573, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 13218 + }, + { + "epoch": 0.13219, + "grad_norm": 0.8790787588417616, + "learning_rate": 0.003, + "loss": 4.139, + "step": 13219 + }, + { + "epoch": 0.1322, + "grad_norm": 0.88143566929359, + "learning_rate": 0.003, + "loss": 4.1247, + "step": 13220 + }, + { + "epoch": 0.13221, + "grad_norm": 0.7886865446284567, + "learning_rate": 0.003, + "loss": 4.09, + "step": 13221 + }, + { + "epoch": 0.13222, + "grad_norm": 0.8703866688049373, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 13222 + }, + { + "epoch": 0.13223, + "grad_norm": 1.0610641583952305, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 13223 + }, + { + "epoch": 0.13224, + "grad_norm": 0.8877308142908692, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 13224 + }, + { + "epoch": 0.13225, + "grad_norm": 0.7923502126794982, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 13225 + }, + { + "epoch": 0.13226, + "grad_norm": 0.8037426063643605, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 13226 + }, + { + "epoch": 0.13227, + "grad_norm": 0.8298313561068729, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 13227 + }, + { + "epoch": 0.13228, + "grad_norm": 0.8313912515009332, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 13228 + }, + { + "epoch": 0.13229, + "grad_norm": 0.7641603605460101, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 13229 + }, + { + "epoch": 0.1323, + "grad_norm": 0.7234519928662755, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 13230 + }, + { + "epoch": 0.13231, + "grad_norm": 0.7013429757158008, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 13231 + }, + { + "epoch": 0.13232, + "grad_norm": 0.7272956344706222, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 13232 + }, + { + "epoch": 0.13233, + "grad_norm": 0.7018840386890066, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 13233 + }, + { + "epoch": 0.13234, + "grad_norm": 0.6483590280546878, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 13234 + }, + { + "epoch": 0.13235, + "grad_norm": 0.5960213433595248, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 13235 + }, + { + "epoch": 0.13236, + "grad_norm": 0.6362690194768988, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 13236 + }, + { + "epoch": 0.13237, + "grad_norm": 0.7016524575189277, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 13237 + }, + { + "epoch": 0.13238, + "grad_norm": 0.8818072070251921, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 13238 + }, + { + "epoch": 0.13239, + "grad_norm": 1.1771151101524453, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 13239 + }, + { + "epoch": 0.1324, + "grad_norm": 0.7848927790510807, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 13240 + }, + { + "epoch": 0.13241, + "grad_norm": 0.7294357892230204, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 13241 + }, + { + "epoch": 0.13242, + "grad_norm": 0.7922721302088672, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 13242 + }, + { + "epoch": 0.13243, + "grad_norm": 0.8779073015391228, + "learning_rate": 0.003, + "loss": 4.065, + "step": 13243 + }, + { + "epoch": 0.13244, + "grad_norm": 0.9028766978779963, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 13244 + }, + { + "epoch": 0.13245, + "grad_norm": 0.8846836016770767, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 13245 + }, + { + "epoch": 0.13246, + "grad_norm": 0.9073640025981549, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 13246 + }, + { + "epoch": 0.13247, + "grad_norm": 1.0184152912935793, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 13247 + }, + { + "epoch": 0.13248, + "grad_norm": 1.1201902379000934, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 13248 + }, + { + "epoch": 0.13249, + "grad_norm": 0.8199137014662327, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 13249 + }, + { + "epoch": 0.1325, + "grad_norm": 0.6982941709290479, + "learning_rate": 0.003, + "loss": 4.067, + "step": 13250 + }, + { + "epoch": 0.13251, + "grad_norm": 0.6265302384897115, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 13251 + }, + { + "epoch": 0.13252, + "grad_norm": 0.731938249235553, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 13252 + }, + { + "epoch": 0.13253, + "grad_norm": 0.6513251744498714, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 13253 + }, + { + "epoch": 0.13254, + "grad_norm": 0.713972326989097, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 13254 + }, + { + "epoch": 0.13255, + "grad_norm": 0.728018823629036, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 13255 + }, + { + "epoch": 0.13256, + "grad_norm": 0.7843870536109315, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 13256 + }, + { + "epoch": 0.13257, + "grad_norm": 0.8055451385935462, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 13257 + }, + { + "epoch": 0.13258, + "grad_norm": 0.7562579848398723, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 13258 + }, + { + "epoch": 0.13259, + "grad_norm": 0.8669239597629805, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 13259 + }, + { + "epoch": 0.1326, + "grad_norm": 0.9138324374423776, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 13260 + }, + { + "epoch": 0.13261, + "grad_norm": 1.0301598630577251, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 13261 + }, + { + "epoch": 0.13262, + "grad_norm": 1.0191619611685017, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 13262 + }, + { + "epoch": 0.13263, + "grad_norm": 1.190057684517433, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 13263 + }, + { + "epoch": 0.13264, + "grad_norm": 0.8701662649204158, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 13264 + }, + { + "epoch": 0.13265, + "grad_norm": 0.8490125306455042, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 13265 + }, + { + "epoch": 0.13266, + "grad_norm": 0.8607121453115351, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 13266 + }, + { + "epoch": 0.13267, + "grad_norm": 0.9724870173082061, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 13267 + }, + { + "epoch": 0.13268, + "grad_norm": 1.0567976063712055, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 13268 + }, + { + "epoch": 0.13269, + "grad_norm": 1.1196162465438617, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 13269 + }, + { + "epoch": 0.1327, + "grad_norm": 0.7152977888864376, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 13270 + }, + { + "epoch": 0.13271, + "grad_norm": 0.5952106093514813, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 13271 + }, + { + "epoch": 0.13272, + "grad_norm": 0.7761989531262784, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 13272 + }, + { + "epoch": 0.13273, + "grad_norm": 0.9037516366698034, + "learning_rate": 0.003, + "loss": 4.067, + "step": 13273 + }, + { + "epoch": 0.13274, + "grad_norm": 1.1117710770327596, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 13274 + }, + { + "epoch": 0.13275, + "grad_norm": 0.8593479491958381, + "learning_rate": 0.003, + "loss": 4.092, + "step": 13275 + }, + { + "epoch": 0.13276, + "grad_norm": 0.7862787517919937, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 13276 + }, + { + "epoch": 0.13277, + "grad_norm": 0.7995918193250531, + "learning_rate": 0.003, + "loss": 4.088, + "step": 13277 + }, + { + "epoch": 0.13278, + "grad_norm": 0.8826709339067829, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 13278 + }, + { + "epoch": 0.13279, + "grad_norm": 0.9899054575114248, + "learning_rate": 0.003, + "loss": 4.105, + "step": 13279 + }, + { + "epoch": 0.1328, + "grad_norm": 1.034194784788381, + "learning_rate": 0.003, + "loss": 4.063, + "step": 13280 + }, + { + "epoch": 0.13281, + "grad_norm": 0.9311590661929713, + "learning_rate": 0.003, + "loss": 4.088, + "step": 13281 + }, + { + "epoch": 0.13282, + "grad_norm": 0.9933747884472336, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 13282 + }, + { + "epoch": 0.13283, + "grad_norm": 0.9179990199647776, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 13283 + }, + { + "epoch": 0.13284, + "grad_norm": 0.9088146382820541, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 13284 + }, + { + "epoch": 0.13285, + "grad_norm": 0.841846082693096, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 13285 + }, + { + "epoch": 0.13286, + "grad_norm": 0.7560675081483034, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 13286 + }, + { + "epoch": 0.13287, + "grad_norm": 0.7787318757030279, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 13287 + }, + { + "epoch": 0.13288, + "grad_norm": 0.832811008255041, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 13288 + }, + { + "epoch": 0.13289, + "grad_norm": 0.9210899944282458, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 13289 + }, + { + "epoch": 0.1329, + "grad_norm": 1.0727385091488706, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 13290 + }, + { + "epoch": 0.13291, + "grad_norm": 1.0679495390104992, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 13291 + }, + { + "epoch": 0.13292, + "grad_norm": 0.8306751001853409, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 13292 + }, + { + "epoch": 0.13293, + "grad_norm": 0.7722162493002047, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 13293 + }, + { + "epoch": 0.13294, + "grad_norm": 0.8366701719709925, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 13294 + }, + { + "epoch": 0.13295, + "grad_norm": 0.765906423235061, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 13295 + }, + { + "epoch": 0.13296, + "grad_norm": 0.7109085834456828, + "learning_rate": 0.003, + "loss": 4.113, + "step": 13296 + }, + { + "epoch": 0.13297, + "grad_norm": 0.6491217065070526, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 13297 + }, + { + "epoch": 0.13298, + "grad_norm": 0.6042714833628209, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 13298 + }, + { + "epoch": 0.13299, + "grad_norm": 0.583957760179777, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 13299 + }, + { + "epoch": 0.133, + "grad_norm": 0.6678133310698912, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 13300 + }, + { + "epoch": 0.13301, + "grad_norm": 0.7783720570215592, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 13301 + }, + { + "epoch": 0.13302, + "grad_norm": 0.7787634468990381, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 13302 + }, + { + "epoch": 0.13303, + "grad_norm": 0.7032459797829061, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 13303 + }, + { + "epoch": 0.13304, + "grad_norm": 0.8343625749860474, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 13304 + }, + { + "epoch": 0.13305, + "grad_norm": 0.9545945039442477, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 13305 + }, + { + "epoch": 0.13306, + "grad_norm": 1.0194914472166352, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 13306 + }, + { + "epoch": 0.13307, + "grad_norm": 1.0114463071538164, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 13307 + }, + { + "epoch": 0.13308, + "grad_norm": 0.9032839932518736, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 13308 + }, + { + "epoch": 0.13309, + "grad_norm": 0.8370451164267789, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 13309 + }, + { + "epoch": 0.1331, + "grad_norm": 0.711444674343713, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 13310 + }, + { + "epoch": 0.13311, + "grad_norm": 0.6965854414550621, + "learning_rate": 0.003, + "loss": 4.051, + "step": 13311 + }, + { + "epoch": 0.13312, + "grad_norm": 0.7510502291152806, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 13312 + }, + { + "epoch": 0.13313, + "grad_norm": 0.7811519956382372, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 13313 + }, + { + "epoch": 0.13314, + "grad_norm": 0.9225126210582554, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 13314 + }, + { + "epoch": 0.13315, + "grad_norm": 1.019599447545897, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 13315 + }, + { + "epoch": 0.13316, + "grad_norm": 0.9755339455442736, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 13316 + }, + { + "epoch": 0.13317, + "grad_norm": 1.0768744991340515, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 13317 + }, + { + "epoch": 0.13318, + "grad_norm": 0.9731692669405603, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 13318 + }, + { + "epoch": 0.13319, + "grad_norm": 0.9778596312954543, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 13319 + }, + { + "epoch": 0.1332, + "grad_norm": 0.8251283506942848, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 13320 + }, + { + "epoch": 0.13321, + "grad_norm": 0.7626372142889066, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 13321 + }, + { + "epoch": 0.13322, + "grad_norm": 0.6777162098362033, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 13322 + }, + { + "epoch": 0.13323, + "grad_norm": 0.7758716723902352, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 13323 + }, + { + "epoch": 0.13324, + "grad_norm": 1.027747368040209, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 13324 + }, + { + "epoch": 0.13325, + "grad_norm": 1.0318317724565758, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 13325 + }, + { + "epoch": 0.13326, + "grad_norm": 0.9787207770477119, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 13326 + }, + { + "epoch": 0.13327, + "grad_norm": 1.1574350505554019, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 13327 + }, + { + "epoch": 0.13328, + "grad_norm": 0.8247833451234012, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 13328 + }, + { + "epoch": 0.13329, + "grad_norm": 0.7027475649475333, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 13329 + }, + { + "epoch": 0.1333, + "grad_norm": 0.6976677720156358, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 13330 + }, + { + "epoch": 0.13331, + "grad_norm": 0.6676022161383816, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 13331 + }, + { + "epoch": 0.13332, + "grad_norm": 0.7003501667646581, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 13332 + }, + { + "epoch": 0.13333, + "grad_norm": 0.731020837366609, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 13333 + }, + { + "epoch": 0.13334, + "grad_norm": 0.8051657766869936, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 13334 + }, + { + "epoch": 0.13335, + "grad_norm": 0.7729201067377107, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 13335 + }, + { + "epoch": 0.13336, + "grad_norm": 0.8503074795049055, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 13336 + }, + { + "epoch": 0.13337, + "grad_norm": 0.8750344965338823, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 13337 + }, + { + "epoch": 0.13338, + "grad_norm": 0.8004965448174175, + "learning_rate": 0.003, + "loss": 4.1146, + "step": 13338 + }, + { + "epoch": 0.13339, + "grad_norm": 0.8382574319131801, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 13339 + }, + { + "epoch": 0.1334, + "grad_norm": 0.8946647901187211, + "learning_rate": 0.003, + "loss": 4.066, + "step": 13340 + }, + { + "epoch": 0.13341, + "grad_norm": 0.8920149557750658, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 13341 + }, + { + "epoch": 0.13342, + "grad_norm": 0.9319479396827326, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 13342 + }, + { + "epoch": 0.13343, + "grad_norm": 1.1367942821984693, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 13343 + }, + { + "epoch": 0.13344, + "grad_norm": 1.2276221027203078, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 13344 + }, + { + "epoch": 0.13345, + "grad_norm": 0.7607096301855646, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 13345 + }, + { + "epoch": 0.13346, + "grad_norm": 0.6825358870284136, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 13346 + }, + { + "epoch": 0.13347, + "grad_norm": 0.7571488375875286, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 13347 + }, + { + "epoch": 0.13348, + "grad_norm": 0.7154570176565842, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 13348 + }, + { + "epoch": 0.13349, + "grad_norm": 0.7812634900646294, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 13349 + }, + { + "epoch": 0.1335, + "grad_norm": 0.8282451356725966, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 13350 + }, + { + "epoch": 0.13351, + "grad_norm": 0.8979240341739255, + "learning_rate": 0.003, + "loss": 4.085, + "step": 13351 + }, + { + "epoch": 0.13352, + "grad_norm": 0.9390285109647729, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 13352 + }, + { + "epoch": 0.13353, + "grad_norm": 0.9202136357231417, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 13353 + }, + { + "epoch": 0.13354, + "grad_norm": 0.9058592985531205, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 13354 + }, + { + "epoch": 0.13355, + "grad_norm": 0.8557205833569185, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 13355 + }, + { + "epoch": 0.13356, + "grad_norm": 0.7528300927078995, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 13356 + }, + { + "epoch": 0.13357, + "grad_norm": 0.662604417061924, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 13357 + }, + { + "epoch": 0.13358, + "grad_norm": 0.7000789974285319, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 13358 + }, + { + "epoch": 0.13359, + "grad_norm": 0.7622887648402079, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 13359 + }, + { + "epoch": 0.1336, + "grad_norm": 0.817243636191248, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 13360 + }, + { + "epoch": 0.13361, + "grad_norm": 0.8562119574046776, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 13361 + }, + { + "epoch": 0.13362, + "grad_norm": 1.010234313955433, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 13362 + }, + { + "epoch": 0.13363, + "grad_norm": 1.1214538024604714, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 13363 + }, + { + "epoch": 0.13364, + "grad_norm": 0.9007334539006093, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 13364 + }, + { + "epoch": 0.13365, + "grad_norm": 0.782430415406902, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 13365 + }, + { + "epoch": 0.13366, + "grad_norm": 0.6656368480111795, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 13366 + }, + { + "epoch": 0.13367, + "grad_norm": 0.7581005396936631, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 13367 + }, + { + "epoch": 0.13368, + "grad_norm": 0.8526986605439665, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 13368 + }, + { + "epoch": 0.13369, + "grad_norm": 1.07039059656977, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 13369 + }, + { + "epoch": 0.1337, + "grad_norm": 1.167795834790775, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 13370 + }, + { + "epoch": 0.13371, + "grad_norm": 0.7673312662291831, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 13371 + }, + { + "epoch": 0.13372, + "grad_norm": 0.7018354684911177, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 13372 + }, + { + "epoch": 0.13373, + "grad_norm": 0.6618742873530599, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 13373 + }, + { + "epoch": 0.13374, + "grad_norm": 0.7461046736923796, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 13374 + }, + { + "epoch": 0.13375, + "grad_norm": 0.8093240859237089, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 13375 + }, + { + "epoch": 0.13376, + "grad_norm": 0.8188481304026195, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 13376 + }, + { + "epoch": 0.13377, + "grad_norm": 0.8843136314523814, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 13377 + }, + { + "epoch": 0.13378, + "grad_norm": 0.9555559955370252, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 13378 + }, + { + "epoch": 0.13379, + "grad_norm": 0.997572197642538, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 13379 + }, + { + "epoch": 0.1338, + "grad_norm": 1.0855199724802314, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 13380 + }, + { + "epoch": 0.13381, + "grad_norm": 0.9330965514895911, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 13381 + }, + { + "epoch": 0.13382, + "grad_norm": 0.993615014824088, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 13382 + }, + { + "epoch": 0.13383, + "grad_norm": 1.06876028029054, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 13383 + }, + { + "epoch": 0.13384, + "grad_norm": 0.9544246482417755, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 13384 + }, + { + "epoch": 0.13385, + "grad_norm": 0.9293864018413087, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 13385 + }, + { + "epoch": 0.13386, + "grad_norm": 0.96226321750807, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 13386 + }, + { + "epoch": 0.13387, + "grad_norm": 0.946212009523684, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 13387 + }, + { + "epoch": 0.13388, + "grad_norm": 0.806514044512673, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 13388 + }, + { + "epoch": 0.13389, + "grad_norm": 0.8000459375315871, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 13389 + }, + { + "epoch": 0.1339, + "grad_norm": 0.7366439724037857, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 13390 + }, + { + "epoch": 0.13391, + "grad_norm": 0.8181400796841161, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 13391 + }, + { + "epoch": 0.13392, + "grad_norm": 0.999696214534662, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 13392 + }, + { + "epoch": 0.13393, + "grad_norm": 1.2717687360989915, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 13393 + }, + { + "epoch": 0.13394, + "grad_norm": 0.6855942854787728, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 13394 + }, + { + "epoch": 0.13395, + "grad_norm": 0.6210133008885812, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 13395 + }, + { + "epoch": 0.13396, + "grad_norm": 0.742261110987861, + "learning_rate": 0.003, + "loss": 4.062, + "step": 13396 + }, + { + "epoch": 0.13397, + "grad_norm": 0.7906941965969082, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 13397 + }, + { + "epoch": 0.13398, + "grad_norm": 0.8269568020833963, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 13398 + }, + { + "epoch": 0.13399, + "grad_norm": 0.8887188100677542, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 13399 + }, + { + "epoch": 0.134, + "grad_norm": 0.9132823458032262, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 13400 + }, + { + "epoch": 0.13401, + "grad_norm": 0.8876835827299703, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 13401 + }, + { + "epoch": 0.13402, + "grad_norm": 0.9344253021723582, + "learning_rate": 0.003, + "loss": 4.033, + "step": 13402 + }, + { + "epoch": 0.13403, + "grad_norm": 1.1870397445205836, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 13403 + }, + { + "epoch": 0.13404, + "grad_norm": 0.8520339969930502, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 13404 + }, + { + "epoch": 0.13405, + "grad_norm": 0.7530756929638318, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 13405 + }, + { + "epoch": 0.13406, + "grad_norm": 0.7291047875714687, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 13406 + }, + { + "epoch": 0.13407, + "grad_norm": 0.8192456380386125, + "learning_rate": 0.003, + "loss": 4.083, + "step": 13407 + }, + { + "epoch": 0.13408, + "grad_norm": 1.0734420527156827, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 13408 + }, + { + "epoch": 0.13409, + "grad_norm": 1.1175490283853078, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 13409 + }, + { + "epoch": 0.1341, + "grad_norm": 0.7573190993520491, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 13410 + }, + { + "epoch": 0.13411, + "grad_norm": 0.5935748761477859, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 13411 + }, + { + "epoch": 0.13412, + "grad_norm": 0.6489974564660388, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 13412 + }, + { + "epoch": 0.13413, + "grad_norm": 0.7229439519999141, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 13413 + }, + { + "epoch": 0.13414, + "grad_norm": 0.7689091743956452, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 13414 + }, + { + "epoch": 0.13415, + "grad_norm": 0.7754417224407258, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 13415 + }, + { + "epoch": 0.13416, + "grad_norm": 0.6745797484626426, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 13416 + }, + { + "epoch": 0.13417, + "grad_norm": 0.8217901459147499, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 13417 + }, + { + "epoch": 0.13418, + "grad_norm": 0.9182152387918147, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 13418 + }, + { + "epoch": 0.13419, + "grad_norm": 0.9469773874113262, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 13419 + }, + { + "epoch": 0.1342, + "grad_norm": 1.1086457175612325, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 13420 + }, + { + "epoch": 0.13421, + "grad_norm": 0.9384061110337768, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 13421 + }, + { + "epoch": 0.13422, + "grad_norm": 0.954951689668296, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 13422 + }, + { + "epoch": 0.13423, + "grad_norm": 1.052374563535259, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 13423 + }, + { + "epoch": 0.13424, + "grad_norm": 0.9805125849208332, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 13424 + }, + { + "epoch": 0.13425, + "grad_norm": 0.8483129123086609, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 13425 + }, + { + "epoch": 0.13426, + "grad_norm": 0.7966598067261172, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 13426 + }, + { + "epoch": 0.13427, + "grad_norm": 0.8762148637542061, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 13427 + }, + { + "epoch": 0.13428, + "grad_norm": 0.9094155326975562, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 13428 + }, + { + "epoch": 0.13429, + "grad_norm": 0.9663884563079317, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 13429 + }, + { + "epoch": 0.1343, + "grad_norm": 1.1211345291455057, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 13430 + }, + { + "epoch": 0.13431, + "grad_norm": 1.0140114717752684, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 13431 + }, + { + "epoch": 0.13432, + "grad_norm": 0.8217997658187381, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 13432 + }, + { + "epoch": 0.13433, + "grad_norm": 0.7248351378919788, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 13433 + }, + { + "epoch": 0.13434, + "grad_norm": 0.6280610647430119, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 13434 + }, + { + "epoch": 0.13435, + "grad_norm": 0.6075072218051248, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 13435 + }, + { + "epoch": 0.13436, + "grad_norm": 0.6456207752877368, + "learning_rate": 0.003, + "loss": 4.088, + "step": 13436 + }, + { + "epoch": 0.13437, + "grad_norm": 0.6145605351226269, + "learning_rate": 0.003, + "loss": 4.062, + "step": 13437 + }, + { + "epoch": 0.13438, + "grad_norm": 0.5517181867891378, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 13438 + }, + { + "epoch": 0.13439, + "grad_norm": 0.6578844697806577, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 13439 + }, + { + "epoch": 0.1344, + "grad_norm": 0.6665378345671775, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 13440 + }, + { + "epoch": 0.13441, + "grad_norm": 0.6418518042646504, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 13441 + }, + { + "epoch": 0.13442, + "grad_norm": 0.6843940993097485, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 13442 + }, + { + "epoch": 0.13443, + "grad_norm": 0.8707193985221511, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 13443 + }, + { + "epoch": 0.13444, + "grad_norm": 0.9847548524745664, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 13444 + }, + { + "epoch": 0.13445, + "grad_norm": 1.128845903324881, + "learning_rate": 0.003, + "loss": 4.074, + "step": 13445 + }, + { + "epoch": 0.13446, + "grad_norm": 1.2068093097783414, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 13446 + }, + { + "epoch": 0.13447, + "grad_norm": 0.7287091593638473, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 13447 + }, + { + "epoch": 0.13448, + "grad_norm": 0.5785864530858607, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 13448 + }, + { + "epoch": 0.13449, + "grad_norm": 0.578733738934119, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 13449 + }, + { + "epoch": 0.1345, + "grad_norm": 0.5265843405909042, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 13450 + }, + { + "epoch": 0.13451, + "grad_norm": 0.5563921477003801, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 13451 + }, + { + "epoch": 0.13452, + "grad_norm": 0.6096950372974864, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 13452 + }, + { + "epoch": 0.13453, + "grad_norm": 0.7508132198296539, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 13453 + }, + { + "epoch": 0.13454, + "grad_norm": 0.7993193243181995, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 13454 + }, + { + "epoch": 0.13455, + "grad_norm": 0.7682487969894645, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 13455 + }, + { + "epoch": 0.13456, + "grad_norm": 0.8796777200899208, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 13456 + }, + { + "epoch": 0.13457, + "grad_norm": 1.1469397610089969, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 13457 + }, + { + "epoch": 0.13458, + "grad_norm": 1.146202488625937, + "learning_rate": 0.003, + "loss": 4.101, + "step": 13458 + }, + { + "epoch": 0.13459, + "grad_norm": 0.8963160468442611, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 13459 + }, + { + "epoch": 0.1346, + "grad_norm": 0.8324177540997719, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 13460 + }, + { + "epoch": 0.13461, + "grad_norm": 0.8768079654245492, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 13461 + }, + { + "epoch": 0.13462, + "grad_norm": 0.8814361611365389, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 13462 + }, + { + "epoch": 0.13463, + "grad_norm": 0.8304217889455825, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 13463 + }, + { + "epoch": 0.13464, + "grad_norm": 0.7849247804892645, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 13464 + }, + { + "epoch": 0.13465, + "grad_norm": 1.0067140334691227, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 13465 + }, + { + "epoch": 0.13466, + "grad_norm": 1.0827853047168359, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 13466 + }, + { + "epoch": 0.13467, + "grad_norm": 0.9863994122771002, + "learning_rate": 0.003, + "loss": 4.109, + "step": 13467 + }, + { + "epoch": 0.13468, + "grad_norm": 1.1519198315158714, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 13468 + }, + { + "epoch": 0.13469, + "grad_norm": 0.9094830470535434, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 13469 + }, + { + "epoch": 0.1347, + "grad_norm": 0.9615241547488029, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 13470 + }, + { + "epoch": 0.13471, + "grad_norm": 1.0026615449923861, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 13471 + }, + { + "epoch": 0.13472, + "grad_norm": 0.8963248151652958, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 13472 + }, + { + "epoch": 0.13473, + "grad_norm": 0.8388906368889392, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 13473 + }, + { + "epoch": 0.13474, + "grad_norm": 0.8659010536701638, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 13474 + }, + { + "epoch": 0.13475, + "grad_norm": 1.1420323194450308, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 13475 + }, + { + "epoch": 0.13476, + "grad_norm": 1.1189402465759628, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 13476 + }, + { + "epoch": 0.13477, + "grad_norm": 0.9838216383823422, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 13477 + }, + { + "epoch": 0.13478, + "grad_norm": 0.9836383431393855, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 13478 + }, + { + "epoch": 0.13479, + "grad_norm": 0.9535032512085219, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 13479 + }, + { + "epoch": 0.1348, + "grad_norm": 1.0082703455759217, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 13480 + }, + { + "epoch": 0.13481, + "grad_norm": 0.9423372318082879, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 13481 + }, + { + "epoch": 0.13482, + "grad_norm": 0.737918943866043, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 13482 + }, + { + "epoch": 0.13483, + "grad_norm": 0.696892101834753, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 13483 + }, + { + "epoch": 0.13484, + "grad_norm": 0.6846874888722756, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 13484 + }, + { + "epoch": 0.13485, + "grad_norm": 0.7041497552775802, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 13485 + }, + { + "epoch": 0.13486, + "grad_norm": 0.7623820637051687, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 13486 + }, + { + "epoch": 0.13487, + "grad_norm": 0.8490472011425173, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 13487 + }, + { + "epoch": 0.13488, + "grad_norm": 0.9612767459994789, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 13488 + }, + { + "epoch": 0.13489, + "grad_norm": 0.9657936696913836, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 13489 + }, + { + "epoch": 0.1349, + "grad_norm": 0.9130234524107858, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 13490 + }, + { + "epoch": 0.13491, + "grad_norm": 0.8790648583629521, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 13491 + }, + { + "epoch": 0.13492, + "grad_norm": 0.9532097034274005, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 13492 + }, + { + "epoch": 0.13493, + "grad_norm": 1.0608055718960732, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 13493 + }, + { + "epoch": 0.13494, + "grad_norm": 1.3057994901362042, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 13494 + }, + { + "epoch": 0.13495, + "grad_norm": 0.7033861599216819, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 13495 + }, + { + "epoch": 0.13496, + "grad_norm": 0.6617324852095671, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 13496 + }, + { + "epoch": 0.13497, + "grad_norm": 0.7023942775756882, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 13497 + }, + { + "epoch": 0.13498, + "grad_norm": 0.669109474440314, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 13498 + }, + { + "epoch": 0.13499, + "grad_norm": 0.6316320567905482, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 13499 + }, + { + "epoch": 0.135, + "grad_norm": 0.6942712253269201, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 13500 + }, + { + "epoch": 0.13501, + "grad_norm": 0.803069854384175, + "learning_rate": 0.003, + "loss": 4.092, + "step": 13501 + }, + { + "epoch": 0.13502, + "grad_norm": 0.869373680749491, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 13502 + }, + { + "epoch": 0.13503, + "grad_norm": 0.9541104536726139, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 13503 + }, + { + "epoch": 0.13504, + "grad_norm": 1.0188968412109567, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 13504 + }, + { + "epoch": 0.13505, + "grad_norm": 1.0129334158742416, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 13505 + }, + { + "epoch": 0.13506, + "grad_norm": 0.9867883103916336, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 13506 + }, + { + "epoch": 0.13507, + "grad_norm": 0.8006738248554235, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 13507 + }, + { + "epoch": 0.13508, + "grad_norm": 0.756787490611719, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 13508 + }, + { + "epoch": 0.13509, + "grad_norm": 0.7731031670930801, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 13509 + }, + { + "epoch": 0.1351, + "grad_norm": 0.8686810584219077, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 13510 + }, + { + "epoch": 0.13511, + "grad_norm": 0.8508271823239518, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 13511 + }, + { + "epoch": 0.13512, + "grad_norm": 0.8478064096923771, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 13512 + }, + { + "epoch": 0.13513, + "grad_norm": 0.7839227854594315, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 13513 + }, + { + "epoch": 0.13514, + "grad_norm": 0.7301078450648706, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 13514 + }, + { + "epoch": 0.13515, + "grad_norm": 0.7336051058766169, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 13515 + }, + { + "epoch": 0.13516, + "grad_norm": 0.7245613565259936, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 13516 + }, + { + "epoch": 0.13517, + "grad_norm": 0.8211092942650285, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 13517 + }, + { + "epoch": 0.13518, + "grad_norm": 0.7879538257024276, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 13518 + }, + { + "epoch": 0.13519, + "grad_norm": 0.9148074094319852, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 13519 + }, + { + "epoch": 0.1352, + "grad_norm": 1.0521107275543449, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 13520 + }, + { + "epoch": 0.13521, + "grad_norm": 1.1302849728490965, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 13521 + }, + { + "epoch": 0.13522, + "grad_norm": 0.9697364533787071, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 13522 + }, + { + "epoch": 0.13523, + "grad_norm": 1.0552404851435344, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 13523 + }, + { + "epoch": 0.13524, + "grad_norm": 0.9744665347810808, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 13524 + }, + { + "epoch": 0.13525, + "grad_norm": 0.9272379642376642, + "learning_rate": 0.003, + "loss": 4.082, + "step": 13525 + }, + { + "epoch": 0.13526, + "grad_norm": 0.9557207161887343, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 13526 + }, + { + "epoch": 0.13527, + "grad_norm": 0.9293390684670042, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 13527 + }, + { + "epoch": 0.13528, + "grad_norm": 1.0052828200881132, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 13528 + }, + { + "epoch": 0.13529, + "grad_norm": 1.0371311360910564, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 13529 + }, + { + "epoch": 0.1353, + "grad_norm": 1.0126977222126474, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 13530 + }, + { + "epoch": 0.13531, + "grad_norm": 0.7933620857620177, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 13531 + }, + { + "epoch": 0.13532, + "grad_norm": 0.7219190481703145, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 13532 + }, + { + "epoch": 0.13533, + "grad_norm": 0.7990634460705841, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 13533 + }, + { + "epoch": 0.13534, + "grad_norm": 0.9732672531781391, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 13534 + }, + { + "epoch": 0.13535, + "grad_norm": 1.1699596888991803, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 13535 + }, + { + "epoch": 0.13536, + "grad_norm": 0.8445467016318637, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 13536 + }, + { + "epoch": 0.13537, + "grad_norm": 0.756263504466685, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 13537 + }, + { + "epoch": 0.13538, + "grad_norm": 0.7757852409412948, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 13538 + }, + { + "epoch": 0.13539, + "grad_norm": 0.7805717985227083, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 13539 + }, + { + "epoch": 0.1354, + "grad_norm": 0.7861378914606122, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 13540 + }, + { + "epoch": 0.13541, + "grad_norm": 0.7840620668567959, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 13541 + }, + { + "epoch": 0.13542, + "grad_norm": 0.7729986654803618, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 13542 + }, + { + "epoch": 0.13543, + "grad_norm": 0.659090066702033, + "learning_rate": 0.003, + "loss": 4.06, + "step": 13543 + }, + { + "epoch": 0.13544, + "grad_norm": 0.703870682999343, + "learning_rate": 0.003, + "loss": 4.078, + "step": 13544 + }, + { + "epoch": 0.13545, + "grad_norm": 0.6759376770950815, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 13545 + }, + { + "epoch": 0.13546, + "grad_norm": 0.7681361437877832, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 13546 + }, + { + "epoch": 0.13547, + "grad_norm": 0.7985128232105765, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 13547 + }, + { + "epoch": 0.13548, + "grad_norm": 0.7456046344594788, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 13548 + }, + { + "epoch": 0.13549, + "grad_norm": 0.8366654808188815, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 13549 + }, + { + "epoch": 0.1355, + "grad_norm": 1.0109224036728328, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 13550 + }, + { + "epoch": 0.13551, + "grad_norm": 1.109188580324219, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 13551 + }, + { + "epoch": 0.13552, + "grad_norm": 0.9710169157720988, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 13552 + }, + { + "epoch": 0.13553, + "grad_norm": 0.8362989825847552, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 13553 + }, + { + "epoch": 0.13554, + "grad_norm": 0.7901306253623673, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 13554 + }, + { + "epoch": 0.13555, + "grad_norm": 0.8240789603855017, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 13555 + }, + { + "epoch": 0.13556, + "grad_norm": 0.8048037134669658, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 13556 + }, + { + "epoch": 0.13557, + "grad_norm": 0.8028701465113719, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 13557 + }, + { + "epoch": 0.13558, + "grad_norm": 0.833805799784846, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 13558 + }, + { + "epoch": 0.13559, + "grad_norm": 0.9000878822488261, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 13559 + }, + { + "epoch": 0.1356, + "grad_norm": 0.9092498325799294, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 13560 + }, + { + "epoch": 0.13561, + "grad_norm": 1.089421266137573, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 13561 + }, + { + "epoch": 0.13562, + "grad_norm": 1.1912402355735798, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 13562 + }, + { + "epoch": 0.13563, + "grad_norm": 0.8159061983358631, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 13563 + }, + { + "epoch": 0.13564, + "grad_norm": 0.6140979932194808, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 13564 + }, + { + "epoch": 0.13565, + "grad_norm": 0.5625859668192599, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 13565 + }, + { + "epoch": 0.13566, + "grad_norm": 0.6050358428049795, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 13566 + }, + { + "epoch": 0.13567, + "grad_norm": 0.6868758702178857, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 13567 + }, + { + "epoch": 0.13568, + "grad_norm": 0.7205192721231606, + "learning_rate": 0.003, + "loss": 4.063, + "step": 13568 + }, + { + "epoch": 0.13569, + "grad_norm": 0.7278685945933725, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 13569 + }, + { + "epoch": 0.1357, + "grad_norm": 0.6764390908788951, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 13570 + }, + { + "epoch": 0.13571, + "grad_norm": 0.6973655855305073, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 13571 + }, + { + "epoch": 0.13572, + "grad_norm": 0.7012650191061759, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 13572 + }, + { + "epoch": 0.13573, + "grad_norm": 0.8050491163634, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 13573 + }, + { + "epoch": 0.13574, + "grad_norm": 0.9743995420341659, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 13574 + }, + { + "epoch": 0.13575, + "grad_norm": 1.2222001451313191, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 13575 + }, + { + "epoch": 0.13576, + "grad_norm": 0.7012402180003754, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 13576 + }, + { + "epoch": 0.13577, + "grad_norm": 0.753848322169635, + "learning_rate": 0.003, + "loss": 4.077, + "step": 13577 + }, + { + "epoch": 0.13578, + "grad_norm": 1.0667244804526879, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 13578 + }, + { + "epoch": 0.13579, + "grad_norm": 0.9979644252700127, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 13579 + }, + { + "epoch": 0.1358, + "grad_norm": 0.8636628689836429, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 13580 + }, + { + "epoch": 0.13581, + "grad_norm": 0.8122661263515126, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 13581 + }, + { + "epoch": 0.13582, + "grad_norm": 0.7823422529780943, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 13582 + }, + { + "epoch": 0.13583, + "grad_norm": 0.8973264048047405, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 13583 + }, + { + "epoch": 0.13584, + "grad_norm": 1.0198634801706925, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 13584 + }, + { + "epoch": 0.13585, + "grad_norm": 0.9758609896953625, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 13585 + }, + { + "epoch": 0.13586, + "grad_norm": 0.9574803853337032, + "learning_rate": 0.003, + "loss": 4.1225, + "step": 13586 + }, + { + "epoch": 0.13587, + "grad_norm": 1.0153232515470276, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 13587 + }, + { + "epoch": 0.13588, + "grad_norm": 1.1243945262891535, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 13588 + }, + { + "epoch": 0.13589, + "grad_norm": 1.0555024684934773, + "learning_rate": 0.003, + "loss": 4.092, + "step": 13589 + }, + { + "epoch": 0.1359, + "grad_norm": 0.9386569712532187, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 13590 + }, + { + "epoch": 0.13591, + "grad_norm": 0.9423432065056779, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 13591 + }, + { + "epoch": 0.13592, + "grad_norm": 0.9570236944619993, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 13592 + }, + { + "epoch": 0.13593, + "grad_norm": 0.9994032976286987, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 13593 + }, + { + "epoch": 0.13594, + "grad_norm": 1.1293489136713144, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 13594 + }, + { + "epoch": 0.13595, + "grad_norm": 0.8770596374307061, + "learning_rate": 0.003, + "loss": 4.089, + "step": 13595 + }, + { + "epoch": 0.13596, + "grad_norm": 0.9540307039164383, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 13596 + }, + { + "epoch": 0.13597, + "grad_norm": 1.0348464305881202, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 13597 + }, + { + "epoch": 0.13598, + "grad_norm": 0.7952445402465652, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 13598 + }, + { + "epoch": 0.13599, + "grad_norm": 0.838767728060577, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 13599 + }, + { + "epoch": 0.136, + "grad_norm": 0.7859214529580825, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 13600 + }, + { + "epoch": 0.13601, + "grad_norm": 0.8664034168727403, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 13601 + }, + { + "epoch": 0.13602, + "grad_norm": 1.05749460632109, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 13602 + }, + { + "epoch": 0.13603, + "grad_norm": 0.9115800757473246, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 13603 + }, + { + "epoch": 0.13604, + "grad_norm": 1.051125147838006, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 13604 + }, + { + "epoch": 0.13605, + "grad_norm": 0.9750073294288395, + "learning_rate": 0.003, + "loss": 4.038, + "step": 13605 + }, + { + "epoch": 0.13606, + "grad_norm": 0.8980573196185354, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 13606 + }, + { + "epoch": 0.13607, + "grad_norm": 0.8747864491235297, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 13607 + }, + { + "epoch": 0.13608, + "grad_norm": 0.9574554287453303, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 13608 + }, + { + "epoch": 0.13609, + "grad_norm": 0.9258162718847411, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 13609 + }, + { + "epoch": 0.1361, + "grad_norm": 0.7573297562448439, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 13610 + }, + { + "epoch": 0.13611, + "grad_norm": 0.6582701023818656, + "learning_rate": 0.003, + "loss": 4.066, + "step": 13611 + }, + { + "epoch": 0.13612, + "grad_norm": 0.6137564095547992, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 13612 + }, + { + "epoch": 0.13613, + "grad_norm": 0.6906795147923297, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 13613 + }, + { + "epoch": 0.13614, + "grad_norm": 0.7317210056279667, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 13614 + }, + { + "epoch": 0.13615, + "grad_norm": 0.7973083599241934, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 13615 + }, + { + "epoch": 0.13616, + "grad_norm": 0.8122996454877814, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 13616 + }, + { + "epoch": 0.13617, + "grad_norm": 0.6800092785282158, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 13617 + }, + { + "epoch": 0.13618, + "grad_norm": 0.7395444303121143, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 13618 + }, + { + "epoch": 0.13619, + "grad_norm": 0.8601458402892523, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 13619 + }, + { + "epoch": 0.1362, + "grad_norm": 0.7866587914691752, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 13620 + }, + { + "epoch": 0.13621, + "grad_norm": 0.6894813065274821, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 13621 + }, + { + "epoch": 0.13622, + "grad_norm": 0.7751245638506891, + "learning_rate": 0.003, + "loss": 4.069, + "step": 13622 + }, + { + "epoch": 0.13623, + "grad_norm": 0.7811576890318376, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 13623 + }, + { + "epoch": 0.13624, + "grad_norm": 1.0791880392784912, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 13624 + }, + { + "epoch": 0.13625, + "grad_norm": 1.1661365389411822, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 13625 + }, + { + "epoch": 0.13626, + "grad_norm": 0.8310605669671869, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 13626 + }, + { + "epoch": 0.13627, + "grad_norm": 0.8116350463802214, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 13627 + }, + { + "epoch": 0.13628, + "grad_norm": 0.732431590284875, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 13628 + }, + { + "epoch": 0.13629, + "grad_norm": 0.7708504377596485, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 13629 + }, + { + "epoch": 0.1363, + "grad_norm": 0.8370934469455775, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 13630 + }, + { + "epoch": 0.13631, + "grad_norm": 0.8721551278857689, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 13631 + }, + { + "epoch": 0.13632, + "grad_norm": 0.8001208046961691, + "learning_rate": 0.003, + "loss": 4.096, + "step": 13632 + }, + { + "epoch": 0.13633, + "grad_norm": 0.6855495881523573, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 13633 + }, + { + "epoch": 0.13634, + "grad_norm": 0.7438299753527488, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 13634 + }, + { + "epoch": 0.13635, + "grad_norm": 0.7952533897916455, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 13635 + }, + { + "epoch": 0.13636, + "grad_norm": 0.8363194500609099, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 13636 + }, + { + "epoch": 0.13637, + "grad_norm": 0.7601839652731163, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 13637 + }, + { + "epoch": 0.13638, + "grad_norm": 0.6780506683164645, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 13638 + }, + { + "epoch": 0.13639, + "grad_norm": 0.683180007369749, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 13639 + }, + { + "epoch": 0.1364, + "grad_norm": 0.6438945473416481, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 13640 + }, + { + "epoch": 0.13641, + "grad_norm": 0.6499667187267382, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 13641 + }, + { + "epoch": 0.13642, + "grad_norm": 0.6850170351662737, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 13642 + }, + { + "epoch": 0.13643, + "grad_norm": 0.8136727294891523, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 13643 + }, + { + "epoch": 0.13644, + "grad_norm": 1.2156858921156055, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 13644 + }, + { + "epoch": 0.13645, + "grad_norm": 1.0436887520306182, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 13645 + }, + { + "epoch": 0.13646, + "grad_norm": 0.8449363069264667, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 13646 + }, + { + "epoch": 0.13647, + "grad_norm": 0.850875163767057, + "learning_rate": 0.003, + "loss": 4.077, + "step": 13647 + }, + { + "epoch": 0.13648, + "grad_norm": 0.95375218441901, + "learning_rate": 0.003, + "loss": 4.041, + "step": 13648 + }, + { + "epoch": 0.13649, + "grad_norm": 1.0050975309029315, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 13649 + }, + { + "epoch": 0.1365, + "grad_norm": 0.9843391727985866, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 13650 + }, + { + "epoch": 0.13651, + "grad_norm": 0.9427451081512389, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 13651 + }, + { + "epoch": 0.13652, + "grad_norm": 0.9763292838665173, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 13652 + }, + { + "epoch": 0.13653, + "grad_norm": 0.8634704292443027, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 13653 + }, + { + "epoch": 0.13654, + "grad_norm": 0.9868341046720903, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 13654 + }, + { + "epoch": 0.13655, + "grad_norm": 1.0639061905588236, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 13655 + }, + { + "epoch": 0.13656, + "grad_norm": 1.091557124889247, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 13656 + }, + { + "epoch": 0.13657, + "grad_norm": 0.9231903366512457, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 13657 + }, + { + "epoch": 0.13658, + "grad_norm": 0.8486992017924497, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 13658 + }, + { + "epoch": 0.13659, + "grad_norm": 0.7664700162395804, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 13659 + }, + { + "epoch": 0.1366, + "grad_norm": 0.7065379273868619, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 13660 + }, + { + "epoch": 0.13661, + "grad_norm": 0.7256696633312764, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 13661 + }, + { + "epoch": 0.13662, + "grad_norm": 0.7690012211967215, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 13662 + }, + { + "epoch": 0.13663, + "grad_norm": 0.9283692376422882, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 13663 + }, + { + "epoch": 0.13664, + "grad_norm": 1.013928354570009, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 13664 + }, + { + "epoch": 0.13665, + "grad_norm": 1.060865621299839, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 13665 + }, + { + "epoch": 0.13666, + "grad_norm": 0.9465780308512567, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 13666 + }, + { + "epoch": 0.13667, + "grad_norm": 0.9670648703906136, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 13667 + }, + { + "epoch": 0.13668, + "grad_norm": 0.8473843407878847, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 13668 + }, + { + "epoch": 0.13669, + "grad_norm": 0.7314897721635479, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 13669 + }, + { + "epoch": 0.1367, + "grad_norm": 0.7531751860487068, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 13670 + }, + { + "epoch": 0.13671, + "grad_norm": 0.8112896300846317, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 13671 + }, + { + "epoch": 0.13672, + "grad_norm": 0.8532693255583029, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 13672 + }, + { + "epoch": 0.13673, + "grad_norm": 0.9427863519402445, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 13673 + }, + { + "epoch": 0.13674, + "grad_norm": 1.084580485879212, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 13674 + }, + { + "epoch": 0.13675, + "grad_norm": 0.8723766795237089, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 13675 + }, + { + "epoch": 0.13676, + "grad_norm": 0.8475241093068473, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 13676 + }, + { + "epoch": 0.13677, + "grad_norm": 0.9196363658028469, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 13677 + }, + { + "epoch": 0.13678, + "grad_norm": 0.9787266625530816, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 13678 + }, + { + "epoch": 0.13679, + "grad_norm": 0.9010503311425638, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 13679 + }, + { + "epoch": 0.1368, + "grad_norm": 0.7852953043656846, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 13680 + }, + { + "epoch": 0.13681, + "grad_norm": 0.803772106831219, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 13681 + }, + { + "epoch": 0.13682, + "grad_norm": 0.7772930432252959, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 13682 + }, + { + "epoch": 0.13683, + "grad_norm": 0.8199857952108855, + "learning_rate": 0.003, + "loss": 4.073, + "step": 13683 + }, + { + "epoch": 0.13684, + "grad_norm": 0.8137760939512062, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 13684 + }, + { + "epoch": 0.13685, + "grad_norm": 0.8146180859156968, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 13685 + }, + { + "epoch": 0.13686, + "grad_norm": 0.921238466382573, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 13686 + }, + { + "epoch": 0.13687, + "grad_norm": 0.9444575147851021, + "learning_rate": 0.003, + "loss": 4.068, + "step": 13687 + }, + { + "epoch": 0.13688, + "grad_norm": 1.0487047341776896, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 13688 + }, + { + "epoch": 0.13689, + "grad_norm": 1.0075912491629209, + "learning_rate": 0.003, + "loss": 4.098, + "step": 13689 + }, + { + "epoch": 0.1369, + "grad_norm": 0.870940565134766, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 13690 + }, + { + "epoch": 0.13691, + "grad_norm": 0.9240675323645391, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 13691 + }, + { + "epoch": 0.13692, + "grad_norm": 0.9599669386817655, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 13692 + }, + { + "epoch": 0.13693, + "grad_norm": 0.9415637855056248, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 13693 + }, + { + "epoch": 0.13694, + "grad_norm": 0.9501177605434156, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 13694 + }, + { + "epoch": 0.13695, + "grad_norm": 0.8100198531867937, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 13695 + }, + { + "epoch": 0.13696, + "grad_norm": 0.6679951906896466, + "learning_rate": 0.003, + "loss": 4.08, + "step": 13696 + }, + { + "epoch": 0.13697, + "grad_norm": 0.6921532768967519, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 13697 + }, + { + "epoch": 0.13698, + "grad_norm": 0.7470812239613058, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 13698 + }, + { + "epoch": 0.13699, + "grad_norm": 0.9108484007535782, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 13699 + }, + { + "epoch": 0.137, + "grad_norm": 1.1447311238807532, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 13700 + }, + { + "epoch": 0.13701, + "grad_norm": 0.937413956264893, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 13701 + }, + { + "epoch": 0.13702, + "grad_norm": 0.8178232351099999, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 13702 + }, + { + "epoch": 0.13703, + "grad_norm": 0.7017352086026089, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 13703 + }, + { + "epoch": 0.13704, + "grad_norm": 0.6560245541563535, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 13704 + }, + { + "epoch": 0.13705, + "grad_norm": 0.6797524992943338, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 13705 + }, + { + "epoch": 0.13706, + "grad_norm": 0.7946133213095659, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 13706 + }, + { + "epoch": 0.13707, + "grad_norm": 0.78560764436037, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 13707 + }, + { + "epoch": 0.13708, + "grad_norm": 0.8277643879380545, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 13708 + }, + { + "epoch": 0.13709, + "grad_norm": 0.9993663880079139, + "learning_rate": 0.003, + "loss": 4.065, + "step": 13709 + }, + { + "epoch": 0.1371, + "grad_norm": 1.2098981925658665, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 13710 + }, + { + "epoch": 0.13711, + "grad_norm": 0.9019527402259153, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 13711 + }, + { + "epoch": 0.13712, + "grad_norm": 0.8275757039455064, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 13712 + }, + { + "epoch": 0.13713, + "grad_norm": 0.7899362750168583, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 13713 + }, + { + "epoch": 0.13714, + "grad_norm": 0.8177296214244287, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 13714 + }, + { + "epoch": 0.13715, + "grad_norm": 0.7864591178039964, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 13715 + }, + { + "epoch": 0.13716, + "grad_norm": 0.6699660162112218, + "learning_rate": 0.003, + "loss": 4.037, + "step": 13716 + }, + { + "epoch": 0.13717, + "grad_norm": 0.6090747823320872, + "learning_rate": 0.003, + "loss": 4.064, + "step": 13717 + }, + { + "epoch": 0.13718, + "grad_norm": 0.7320085282083384, + "learning_rate": 0.003, + "loss": 4.081, + "step": 13718 + }, + { + "epoch": 0.13719, + "grad_norm": 0.9370165910644526, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 13719 + }, + { + "epoch": 0.1372, + "grad_norm": 1.18129103744682, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 13720 + }, + { + "epoch": 0.13721, + "grad_norm": 0.7980167825842891, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 13721 + }, + { + "epoch": 0.13722, + "grad_norm": 0.7206463655346406, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 13722 + }, + { + "epoch": 0.13723, + "grad_norm": 0.7338634631935627, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 13723 + }, + { + "epoch": 0.13724, + "grad_norm": 0.7909052635785321, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 13724 + }, + { + "epoch": 0.13725, + "grad_norm": 0.8325526086003372, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 13725 + }, + { + "epoch": 0.13726, + "grad_norm": 0.8030937153949256, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 13726 + }, + { + "epoch": 0.13727, + "grad_norm": 0.8264953386721401, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 13727 + }, + { + "epoch": 0.13728, + "grad_norm": 0.8508217607353825, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 13728 + }, + { + "epoch": 0.13729, + "grad_norm": 0.946430824699291, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 13729 + }, + { + "epoch": 0.1373, + "grad_norm": 1.0914020403010114, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 13730 + }, + { + "epoch": 0.13731, + "grad_norm": 0.9288554160214816, + "learning_rate": 0.003, + "loss": 4.067, + "step": 13731 + }, + { + "epoch": 0.13732, + "grad_norm": 0.8798609706288661, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 13732 + }, + { + "epoch": 0.13733, + "grad_norm": 1.0763765339207674, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 13733 + }, + { + "epoch": 0.13734, + "grad_norm": 1.1094576116187247, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 13734 + }, + { + "epoch": 0.13735, + "grad_norm": 0.72949427796053, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 13735 + }, + { + "epoch": 0.13736, + "grad_norm": 0.6207999180124579, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 13736 + }, + { + "epoch": 0.13737, + "grad_norm": 0.6539839556099966, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 13737 + }, + { + "epoch": 0.13738, + "grad_norm": 0.6969971908777107, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 13738 + }, + { + "epoch": 0.13739, + "grad_norm": 0.7285473962073619, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 13739 + }, + { + "epoch": 0.1374, + "grad_norm": 0.8102903398863351, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 13740 + }, + { + "epoch": 0.13741, + "grad_norm": 1.0221097889574713, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 13741 + }, + { + "epoch": 0.13742, + "grad_norm": 0.9903955434145519, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 13742 + }, + { + "epoch": 0.13743, + "grad_norm": 0.8947687196571883, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 13743 + }, + { + "epoch": 0.13744, + "grad_norm": 0.714024632599002, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 13744 + }, + { + "epoch": 0.13745, + "grad_norm": 0.7498165544155051, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 13745 + }, + { + "epoch": 0.13746, + "grad_norm": 0.8127335402003885, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 13746 + }, + { + "epoch": 0.13747, + "grad_norm": 0.8200482630097694, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 13747 + }, + { + "epoch": 0.13748, + "grad_norm": 0.8783438830484342, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 13748 + }, + { + "epoch": 0.13749, + "grad_norm": 0.8888799915051669, + "learning_rate": 0.003, + "loss": 4.065, + "step": 13749 + }, + { + "epoch": 0.1375, + "grad_norm": 0.9885813087307731, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 13750 + }, + { + "epoch": 0.13751, + "grad_norm": 1.2135856859992784, + "learning_rate": 0.003, + "loss": 4.111, + "step": 13751 + }, + { + "epoch": 0.13752, + "grad_norm": 0.7798500906983413, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 13752 + }, + { + "epoch": 0.13753, + "grad_norm": 0.7356701486495796, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 13753 + }, + { + "epoch": 0.13754, + "grad_norm": 0.9016638345091263, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 13754 + }, + { + "epoch": 0.13755, + "grad_norm": 1.1033413079780579, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 13755 + }, + { + "epoch": 0.13756, + "grad_norm": 0.927188631548279, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 13756 + }, + { + "epoch": 0.13757, + "grad_norm": 0.7729270789457843, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 13757 + }, + { + "epoch": 0.13758, + "grad_norm": 0.7363115802661809, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 13758 + }, + { + "epoch": 0.13759, + "grad_norm": 0.7592211922772076, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 13759 + }, + { + "epoch": 0.1376, + "grad_norm": 0.7446501455003881, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 13760 + }, + { + "epoch": 0.13761, + "grad_norm": 0.7212238566156061, + "learning_rate": 0.003, + "loss": 4.034, + "step": 13761 + }, + { + "epoch": 0.13762, + "grad_norm": 0.7581711637419875, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 13762 + }, + { + "epoch": 0.13763, + "grad_norm": 0.8006710198015181, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 13763 + }, + { + "epoch": 0.13764, + "grad_norm": 0.9664995701855025, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 13764 + }, + { + "epoch": 0.13765, + "grad_norm": 1.1672346999485474, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 13765 + }, + { + "epoch": 0.13766, + "grad_norm": 0.9109583284164834, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 13766 + }, + { + "epoch": 0.13767, + "grad_norm": 0.8452015780168584, + "learning_rate": 0.003, + "loss": 4.054, + "step": 13767 + }, + { + "epoch": 0.13768, + "grad_norm": 0.8119502560786048, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 13768 + }, + { + "epoch": 0.13769, + "grad_norm": 1.0072815571314082, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 13769 + }, + { + "epoch": 0.1377, + "grad_norm": 1.165869816482718, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 13770 + }, + { + "epoch": 0.13771, + "grad_norm": 0.9084630764295751, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 13771 + }, + { + "epoch": 0.13772, + "grad_norm": 0.844520953849131, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 13772 + }, + { + "epoch": 0.13773, + "grad_norm": 0.8726872555065581, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 13773 + }, + { + "epoch": 0.13774, + "grad_norm": 0.7793340636550928, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 13774 + }, + { + "epoch": 0.13775, + "grad_norm": 0.8126996419655287, + "learning_rate": 0.003, + "loss": 4.1034, + "step": 13775 + }, + { + "epoch": 0.13776, + "grad_norm": 0.8801569005034962, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 13776 + }, + { + "epoch": 0.13777, + "grad_norm": 0.9551286417300697, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 13777 + }, + { + "epoch": 0.13778, + "grad_norm": 0.8165579703433642, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 13778 + }, + { + "epoch": 0.13779, + "grad_norm": 0.7789435433001016, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 13779 + }, + { + "epoch": 0.1378, + "grad_norm": 0.7928376917151682, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 13780 + }, + { + "epoch": 0.13781, + "grad_norm": 0.7481096316132387, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 13781 + }, + { + "epoch": 0.13782, + "grad_norm": 0.7777503947233947, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 13782 + }, + { + "epoch": 0.13783, + "grad_norm": 0.8542991874290908, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 13783 + }, + { + "epoch": 0.13784, + "grad_norm": 0.9720862804136864, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 13784 + }, + { + "epoch": 0.13785, + "grad_norm": 1.1154877986375358, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 13785 + }, + { + "epoch": 0.13786, + "grad_norm": 0.9630735815246285, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 13786 + }, + { + "epoch": 0.13787, + "grad_norm": 0.7948417831976556, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 13787 + }, + { + "epoch": 0.13788, + "grad_norm": 0.648743357212664, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 13788 + }, + { + "epoch": 0.13789, + "grad_norm": 0.6241391617920274, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 13789 + }, + { + "epoch": 0.1379, + "grad_norm": 0.5971428516667047, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 13790 + }, + { + "epoch": 0.13791, + "grad_norm": 0.5367881789167211, + "learning_rate": 0.003, + "loss": 4.078, + "step": 13791 + }, + { + "epoch": 0.13792, + "grad_norm": 0.5559013858329298, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 13792 + }, + { + "epoch": 0.13793, + "grad_norm": 0.5767161875300437, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 13793 + }, + { + "epoch": 0.13794, + "grad_norm": 0.5799682646889748, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 13794 + }, + { + "epoch": 0.13795, + "grad_norm": 0.7147042718304768, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 13795 + }, + { + "epoch": 0.13796, + "grad_norm": 0.8674555532184266, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 13796 + }, + { + "epoch": 0.13797, + "grad_norm": 0.9454446297792318, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 13797 + }, + { + "epoch": 0.13798, + "grad_norm": 0.9166854521804426, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 13798 + }, + { + "epoch": 0.13799, + "grad_norm": 1.114005996475773, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 13799 + }, + { + "epoch": 0.138, + "grad_norm": 1.114153798356774, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 13800 + }, + { + "epoch": 0.13801, + "grad_norm": 1.1213716295631775, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 13801 + }, + { + "epoch": 0.13802, + "grad_norm": 0.8689056280736214, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 13802 + }, + { + "epoch": 0.13803, + "grad_norm": 0.7907977137360918, + "learning_rate": 0.003, + "loss": 4.076, + "step": 13803 + }, + { + "epoch": 0.13804, + "grad_norm": 0.8141518765569847, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 13804 + }, + { + "epoch": 0.13805, + "grad_norm": 0.7293812965504043, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 13805 + }, + { + "epoch": 0.13806, + "grad_norm": 0.709596218108507, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 13806 + }, + { + "epoch": 0.13807, + "grad_norm": 0.7848457864855878, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 13807 + }, + { + "epoch": 0.13808, + "grad_norm": 0.8585058692782394, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 13808 + }, + { + "epoch": 0.13809, + "grad_norm": 0.8959977653543688, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 13809 + }, + { + "epoch": 0.1381, + "grad_norm": 0.7245440460794622, + "learning_rate": 0.003, + "loss": 4.047, + "step": 13810 + }, + { + "epoch": 0.13811, + "grad_norm": 0.6400324366048498, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 13811 + }, + { + "epoch": 0.13812, + "grad_norm": 0.6964389374234248, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 13812 + }, + { + "epoch": 0.13813, + "grad_norm": 0.6628085333605657, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 13813 + }, + { + "epoch": 0.13814, + "grad_norm": 0.6258434802751189, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 13814 + }, + { + "epoch": 0.13815, + "grad_norm": 0.6415033432035515, + "learning_rate": 0.003, + "loss": 4.038, + "step": 13815 + }, + { + "epoch": 0.13816, + "grad_norm": 0.7131256086359586, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 13816 + }, + { + "epoch": 0.13817, + "grad_norm": 0.8539536962396401, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 13817 + }, + { + "epoch": 0.13818, + "grad_norm": 1.2659528268865883, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 13818 + }, + { + "epoch": 0.13819, + "grad_norm": 0.9090084828272725, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 13819 + }, + { + "epoch": 0.1382, + "grad_norm": 0.8812080247924137, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 13820 + }, + { + "epoch": 0.13821, + "grad_norm": 1.0378943064189554, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 13821 + }, + { + "epoch": 0.13822, + "grad_norm": 1.1041204426762394, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 13822 + }, + { + "epoch": 0.13823, + "grad_norm": 0.8189568980915312, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 13823 + }, + { + "epoch": 0.13824, + "grad_norm": 0.8858936483460652, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 13824 + }, + { + "epoch": 0.13825, + "grad_norm": 1.1098405437999121, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 13825 + }, + { + "epoch": 0.13826, + "grad_norm": 1.0283270979570271, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 13826 + }, + { + "epoch": 0.13827, + "grad_norm": 0.9376045888039426, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 13827 + }, + { + "epoch": 0.13828, + "grad_norm": 0.9563883985403528, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 13828 + }, + { + "epoch": 0.13829, + "grad_norm": 0.7715868042355665, + "learning_rate": 0.003, + "loss": 4.038, + "step": 13829 + }, + { + "epoch": 0.1383, + "grad_norm": 0.7879364212968953, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 13830 + }, + { + "epoch": 0.13831, + "grad_norm": 0.9418861363077929, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 13831 + }, + { + "epoch": 0.13832, + "grad_norm": 1.1475358778348934, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 13832 + }, + { + "epoch": 0.13833, + "grad_norm": 1.1540925669805697, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 13833 + }, + { + "epoch": 0.13834, + "grad_norm": 0.8875999565754119, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 13834 + }, + { + "epoch": 0.13835, + "grad_norm": 0.6877634991361685, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 13835 + }, + { + "epoch": 0.13836, + "grad_norm": 0.6578700584264132, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 13836 + }, + { + "epoch": 0.13837, + "grad_norm": 0.6227258100422538, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 13837 + }, + { + "epoch": 0.13838, + "grad_norm": 0.7002738833360858, + "learning_rate": 0.003, + "loss": 4.027, + "step": 13838 + }, + { + "epoch": 0.13839, + "grad_norm": 0.749248817125079, + "learning_rate": 0.003, + "loss": 4.08, + "step": 13839 + }, + { + "epoch": 0.1384, + "grad_norm": 0.9289441131530395, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 13840 + }, + { + "epoch": 0.13841, + "grad_norm": 1.1013445196068063, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 13841 + }, + { + "epoch": 0.13842, + "grad_norm": 0.932321072376433, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 13842 + }, + { + "epoch": 0.13843, + "grad_norm": 0.977524130860534, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 13843 + }, + { + "epoch": 0.13844, + "grad_norm": 1.0408277592904422, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 13844 + }, + { + "epoch": 0.13845, + "grad_norm": 1.1276841668984763, + "learning_rate": 0.003, + "loss": 4.09, + "step": 13845 + }, + { + "epoch": 0.13846, + "grad_norm": 0.9033755025070581, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 13846 + }, + { + "epoch": 0.13847, + "grad_norm": 0.8063325033290983, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 13847 + }, + { + "epoch": 0.13848, + "grad_norm": 0.7751890589883782, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 13848 + }, + { + "epoch": 0.13849, + "grad_norm": 0.7130523394391949, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 13849 + }, + { + "epoch": 0.1385, + "grad_norm": 0.7422819135138875, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 13850 + }, + { + "epoch": 0.13851, + "grad_norm": 0.772559891336259, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 13851 + }, + { + "epoch": 0.13852, + "grad_norm": 1.0305335746506892, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 13852 + }, + { + "epoch": 0.13853, + "grad_norm": 1.0686821057604816, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 13853 + }, + { + "epoch": 0.13854, + "grad_norm": 0.8617161136862561, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 13854 + }, + { + "epoch": 0.13855, + "grad_norm": 0.8959961298539898, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 13855 + }, + { + "epoch": 0.13856, + "grad_norm": 0.8942780945647872, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 13856 + }, + { + "epoch": 0.13857, + "grad_norm": 0.9724266400760307, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 13857 + }, + { + "epoch": 0.13858, + "grad_norm": 0.892095929966666, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 13858 + }, + { + "epoch": 0.13859, + "grad_norm": 0.7845456057332171, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 13859 + }, + { + "epoch": 0.1386, + "grad_norm": 0.7863202510170799, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 13860 + }, + { + "epoch": 0.13861, + "grad_norm": 0.724850270661534, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 13861 + }, + { + "epoch": 0.13862, + "grad_norm": 0.682626036200262, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 13862 + }, + { + "epoch": 0.13863, + "grad_norm": 0.7313618271309567, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 13863 + }, + { + "epoch": 0.13864, + "grad_norm": 0.62749590097859, + "learning_rate": 0.003, + "loss": 4.09, + "step": 13864 + }, + { + "epoch": 0.13865, + "grad_norm": 0.6049866909369687, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 13865 + }, + { + "epoch": 0.13866, + "grad_norm": 0.9049127148877567, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 13866 + }, + { + "epoch": 0.13867, + "grad_norm": 1.3325377235073774, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 13867 + }, + { + "epoch": 0.13868, + "grad_norm": 0.7819830335163918, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 13868 + }, + { + "epoch": 0.13869, + "grad_norm": 0.6834694667388045, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 13869 + }, + { + "epoch": 0.1387, + "grad_norm": 0.6118272056142949, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 13870 + }, + { + "epoch": 0.13871, + "grad_norm": 0.6834278316785658, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 13871 + }, + { + "epoch": 0.13872, + "grad_norm": 0.9213148671100461, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 13872 + }, + { + "epoch": 0.13873, + "grad_norm": 1.0221627581595059, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 13873 + }, + { + "epoch": 0.13874, + "grad_norm": 1.0267626825893557, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 13874 + }, + { + "epoch": 0.13875, + "grad_norm": 0.8997216322344997, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 13875 + }, + { + "epoch": 0.13876, + "grad_norm": 0.904524049711079, + "learning_rate": 0.003, + "loss": 4.089, + "step": 13876 + }, + { + "epoch": 0.13877, + "grad_norm": 1.075224551654725, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 13877 + }, + { + "epoch": 0.13878, + "grad_norm": 0.8739164555337202, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 13878 + }, + { + "epoch": 0.13879, + "grad_norm": 0.7676271792656019, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 13879 + }, + { + "epoch": 0.1388, + "grad_norm": 0.8698458064329295, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 13880 + }, + { + "epoch": 0.13881, + "grad_norm": 0.997623534384448, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 13881 + }, + { + "epoch": 0.13882, + "grad_norm": 1.0346764527746597, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 13882 + }, + { + "epoch": 0.13883, + "grad_norm": 1.146461851359024, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 13883 + }, + { + "epoch": 0.13884, + "grad_norm": 0.9582886184544946, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 13884 + }, + { + "epoch": 0.13885, + "grad_norm": 1.120398293538235, + "learning_rate": 0.003, + "loss": 4.122, + "step": 13885 + }, + { + "epoch": 0.13886, + "grad_norm": 0.9425712394898523, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 13886 + }, + { + "epoch": 0.13887, + "grad_norm": 0.8458723555471758, + "learning_rate": 0.003, + "loss": 4.067, + "step": 13887 + }, + { + "epoch": 0.13888, + "grad_norm": 0.7586531946489982, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 13888 + }, + { + "epoch": 0.13889, + "grad_norm": 0.8178178766092938, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 13889 + }, + { + "epoch": 0.1389, + "grad_norm": 1.0274999663987239, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 13890 + }, + { + "epoch": 0.13891, + "grad_norm": 1.088422536589401, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 13891 + }, + { + "epoch": 0.13892, + "grad_norm": 0.9934566817900852, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 13892 + }, + { + "epoch": 0.13893, + "grad_norm": 1.0979690315596429, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 13893 + }, + { + "epoch": 0.13894, + "grad_norm": 0.7359317410490038, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 13894 + }, + { + "epoch": 0.13895, + "grad_norm": 0.6660798775406379, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 13895 + }, + { + "epoch": 0.13896, + "grad_norm": 0.6068604475042304, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 13896 + }, + { + "epoch": 0.13897, + "grad_norm": 0.5554031840270844, + "learning_rate": 0.003, + "loss": 4.067, + "step": 13897 + }, + { + "epoch": 0.13898, + "grad_norm": 0.5343212773662731, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 13898 + }, + { + "epoch": 0.13899, + "grad_norm": 0.5942944529885672, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 13899 + }, + { + "epoch": 0.139, + "grad_norm": 0.5655268819606185, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 13900 + }, + { + "epoch": 0.13901, + "grad_norm": 0.5735486715388576, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 13901 + }, + { + "epoch": 0.13902, + "grad_norm": 0.5578141470648175, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 13902 + }, + { + "epoch": 0.13903, + "grad_norm": 0.5952485590520994, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 13903 + }, + { + "epoch": 0.13904, + "grad_norm": 0.702839100454558, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 13904 + }, + { + "epoch": 0.13905, + "grad_norm": 0.9194180864002266, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 13905 + }, + { + "epoch": 0.13906, + "grad_norm": 0.9918127068613594, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 13906 + }, + { + "epoch": 0.13907, + "grad_norm": 0.9982642946562648, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 13907 + }, + { + "epoch": 0.13908, + "grad_norm": 1.1028592223759681, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 13908 + }, + { + "epoch": 0.13909, + "grad_norm": 0.9119818024419278, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 13909 + }, + { + "epoch": 0.1391, + "grad_norm": 0.9586893804229246, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 13910 + }, + { + "epoch": 0.13911, + "grad_norm": 1.0439638832992049, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 13911 + }, + { + "epoch": 0.13912, + "grad_norm": 0.8210400684919287, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 13912 + }, + { + "epoch": 0.13913, + "grad_norm": 0.8086768694094838, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 13913 + }, + { + "epoch": 0.13914, + "grad_norm": 0.8101091875046591, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 13914 + }, + { + "epoch": 0.13915, + "grad_norm": 0.8725203305820736, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 13915 + }, + { + "epoch": 0.13916, + "grad_norm": 0.8990905891535143, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 13916 + }, + { + "epoch": 0.13917, + "grad_norm": 0.9909167123501083, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 13917 + }, + { + "epoch": 0.13918, + "grad_norm": 1.0230599514140737, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 13918 + }, + { + "epoch": 0.13919, + "grad_norm": 1.07078297100314, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 13919 + }, + { + "epoch": 0.1392, + "grad_norm": 1.1701770532662963, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 13920 + }, + { + "epoch": 0.13921, + "grad_norm": 1.125390222000617, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 13921 + }, + { + "epoch": 0.13922, + "grad_norm": 1.1757748833898825, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 13922 + }, + { + "epoch": 0.13923, + "grad_norm": 0.9656144153356182, + "learning_rate": 0.003, + "loss": 4.1, + "step": 13923 + }, + { + "epoch": 0.13924, + "grad_norm": 0.9040822950204922, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 13924 + }, + { + "epoch": 0.13925, + "grad_norm": 0.9315380370175754, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 13925 + }, + { + "epoch": 0.13926, + "grad_norm": 0.8737668104257097, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 13926 + }, + { + "epoch": 0.13927, + "grad_norm": 0.8291343492556099, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 13927 + }, + { + "epoch": 0.13928, + "grad_norm": 0.7113404829726243, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 13928 + }, + { + "epoch": 0.13929, + "grad_norm": 0.741200172950455, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 13929 + }, + { + "epoch": 0.1393, + "grad_norm": 0.6613773919670253, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 13930 + }, + { + "epoch": 0.13931, + "grad_norm": 0.6653353843683376, + "learning_rate": 0.003, + "loss": 4.006, + "step": 13931 + }, + { + "epoch": 0.13932, + "grad_norm": 0.6287574812459211, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 13932 + }, + { + "epoch": 0.13933, + "grad_norm": 0.5674413496101179, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 13933 + }, + { + "epoch": 0.13934, + "grad_norm": 0.5880944332131156, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 13934 + }, + { + "epoch": 0.13935, + "grad_norm": 0.6342207765203128, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 13935 + }, + { + "epoch": 0.13936, + "grad_norm": 0.6101277789479459, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 13936 + }, + { + "epoch": 0.13937, + "grad_norm": 0.6865857511421336, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 13937 + }, + { + "epoch": 0.13938, + "grad_norm": 0.854569847730593, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 13938 + }, + { + "epoch": 0.13939, + "grad_norm": 1.0856281334857132, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 13939 + }, + { + "epoch": 0.1394, + "grad_norm": 1.147959372718614, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 13940 + }, + { + "epoch": 0.13941, + "grad_norm": 0.8908493754537409, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 13941 + }, + { + "epoch": 0.13942, + "grad_norm": 0.9028758990435468, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 13942 + }, + { + "epoch": 0.13943, + "grad_norm": 0.9649037667031776, + "learning_rate": 0.003, + "loss": 4.1223, + "step": 13943 + }, + { + "epoch": 0.13944, + "grad_norm": 0.9874675628318524, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 13944 + }, + { + "epoch": 0.13945, + "grad_norm": 1.018584331238473, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 13945 + }, + { + "epoch": 0.13946, + "grad_norm": 0.9947494917345606, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 13946 + }, + { + "epoch": 0.13947, + "grad_norm": 0.9749607135922929, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 13947 + }, + { + "epoch": 0.13948, + "grad_norm": 0.9238473106107602, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 13948 + }, + { + "epoch": 0.13949, + "grad_norm": 0.9349268951940659, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 13949 + }, + { + "epoch": 0.1395, + "grad_norm": 1.0428619217148167, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 13950 + }, + { + "epoch": 0.13951, + "grad_norm": 0.8459673684923922, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 13951 + }, + { + "epoch": 0.13952, + "grad_norm": 0.6852193986489556, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 13952 + }, + { + "epoch": 0.13953, + "grad_norm": 0.617506539409373, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 13953 + }, + { + "epoch": 0.13954, + "grad_norm": 0.6406273910256604, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 13954 + }, + { + "epoch": 0.13955, + "grad_norm": 0.7470615445445485, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 13955 + }, + { + "epoch": 0.13956, + "grad_norm": 0.8323240315799879, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 13956 + }, + { + "epoch": 0.13957, + "grad_norm": 0.9840882166301426, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 13957 + }, + { + "epoch": 0.13958, + "grad_norm": 1.1136670107251858, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 13958 + }, + { + "epoch": 0.13959, + "grad_norm": 0.7949677641025017, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 13959 + }, + { + "epoch": 0.1396, + "grad_norm": 0.7616451119844375, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 13960 + }, + { + "epoch": 0.13961, + "grad_norm": 0.7505373073112688, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 13961 + }, + { + "epoch": 0.13962, + "grad_norm": 0.7526777815873916, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 13962 + }, + { + "epoch": 0.13963, + "grad_norm": 0.6195642763313239, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 13963 + }, + { + "epoch": 0.13964, + "grad_norm": 0.5757858013174563, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 13964 + }, + { + "epoch": 0.13965, + "grad_norm": 0.581188960888462, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 13965 + }, + { + "epoch": 0.13966, + "grad_norm": 0.634007454821538, + "learning_rate": 0.003, + "loss": 4.061, + "step": 13966 + }, + { + "epoch": 0.13967, + "grad_norm": 0.7685363948603434, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 13967 + }, + { + "epoch": 0.13968, + "grad_norm": 0.8791380930040772, + "learning_rate": 0.003, + "loss": 4.032, + "step": 13968 + }, + { + "epoch": 0.13969, + "grad_norm": 0.9370012053971709, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 13969 + }, + { + "epoch": 0.1397, + "grad_norm": 1.007104403630713, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 13970 + }, + { + "epoch": 0.13971, + "grad_norm": 1.1017121204227867, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 13971 + }, + { + "epoch": 0.13972, + "grad_norm": 0.9215751691915052, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 13972 + }, + { + "epoch": 0.13973, + "grad_norm": 1.0149330901881009, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 13973 + }, + { + "epoch": 0.13974, + "grad_norm": 1.2031116163852842, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 13974 + }, + { + "epoch": 0.13975, + "grad_norm": 0.9048277045744538, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 13975 + }, + { + "epoch": 0.13976, + "grad_norm": 0.8467577858000327, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 13976 + }, + { + "epoch": 0.13977, + "grad_norm": 0.9347928724178286, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 13977 + }, + { + "epoch": 0.13978, + "grad_norm": 0.9441370106156344, + "learning_rate": 0.003, + "loss": 4.1319, + "step": 13978 + }, + { + "epoch": 0.13979, + "grad_norm": 0.9813737971909722, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 13979 + }, + { + "epoch": 0.1398, + "grad_norm": 1.0800987666044952, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 13980 + }, + { + "epoch": 0.13981, + "grad_norm": 0.9155655036171332, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 13981 + }, + { + "epoch": 0.13982, + "grad_norm": 0.9348220152415775, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 13982 + }, + { + "epoch": 0.13983, + "grad_norm": 0.8703325281705804, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 13983 + }, + { + "epoch": 0.13984, + "grad_norm": 0.8021340331951412, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 13984 + }, + { + "epoch": 0.13985, + "grad_norm": 0.8026066895289523, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 13985 + }, + { + "epoch": 0.13986, + "grad_norm": 0.80465103900619, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 13986 + }, + { + "epoch": 0.13987, + "grad_norm": 0.782418310682838, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 13987 + }, + { + "epoch": 0.13988, + "grad_norm": 0.7134729135125821, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 13988 + }, + { + "epoch": 0.13989, + "grad_norm": 0.7007064251116866, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 13989 + }, + { + "epoch": 0.1399, + "grad_norm": 0.934409382302109, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 13990 + }, + { + "epoch": 0.13991, + "grad_norm": 1.0791515335095017, + "learning_rate": 0.003, + "loss": 4.083, + "step": 13991 + }, + { + "epoch": 0.13992, + "grad_norm": 0.9650430756826706, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 13992 + }, + { + "epoch": 0.13993, + "grad_norm": 0.8339000310403641, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 13993 + }, + { + "epoch": 0.13994, + "grad_norm": 0.7723832134323424, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 13994 + }, + { + "epoch": 0.13995, + "grad_norm": 0.7469181907748006, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 13995 + }, + { + "epoch": 0.13996, + "grad_norm": 0.6762581253213434, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 13996 + }, + { + "epoch": 0.13997, + "grad_norm": 0.6616456388943689, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 13997 + }, + { + "epoch": 0.13998, + "grad_norm": 0.7101556519082999, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 13998 + }, + { + "epoch": 0.13999, + "grad_norm": 0.7122169886672998, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 13999 + }, + { + "epoch": 0.14, + "grad_norm": 0.6968307619173715, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 14000 + }, + { + "epoch": 0.14001, + "grad_norm": 0.698056742340295, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 14001 + }, + { + "epoch": 0.14002, + "grad_norm": 0.7907075724163473, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 14002 + }, + { + "epoch": 0.14003, + "grad_norm": 0.7428160597908467, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 14003 + }, + { + "epoch": 0.14004, + "grad_norm": 0.6858756472286215, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 14004 + }, + { + "epoch": 0.14005, + "grad_norm": 0.92068368447873, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 14005 + }, + { + "epoch": 0.14006, + "grad_norm": 1.1320609772883197, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 14006 + }, + { + "epoch": 0.14007, + "grad_norm": 1.0124595191225807, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 14007 + }, + { + "epoch": 0.14008, + "grad_norm": 1.1553992392890877, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 14008 + }, + { + "epoch": 0.14009, + "grad_norm": 0.9028931663771032, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 14009 + }, + { + "epoch": 0.1401, + "grad_norm": 0.9385550516447129, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 14010 + }, + { + "epoch": 0.14011, + "grad_norm": 0.8041275843331671, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 14011 + }, + { + "epoch": 0.14012, + "grad_norm": 0.7327470492759001, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 14012 + }, + { + "epoch": 0.14013, + "grad_norm": 0.7791426325933855, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 14013 + }, + { + "epoch": 0.14014, + "grad_norm": 0.7998075523884633, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 14014 + }, + { + "epoch": 0.14015, + "grad_norm": 0.7834602187439688, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 14015 + }, + { + "epoch": 0.14016, + "grad_norm": 0.8049002907527643, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 14016 + }, + { + "epoch": 0.14017, + "grad_norm": 0.8442423762394555, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 14017 + }, + { + "epoch": 0.14018, + "grad_norm": 0.8949825666045306, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 14018 + }, + { + "epoch": 0.14019, + "grad_norm": 0.9529272290674913, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 14019 + }, + { + "epoch": 0.1402, + "grad_norm": 1.0588908963705896, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 14020 + }, + { + "epoch": 0.14021, + "grad_norm": 0.8502772012425337, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 14021 + }, + { + "epoch": 0.14022, + "grad_norm": 0.8729125321703824, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 14022 + }, + { + "epoch": 0.14023, + "grad_norm": 0.9281444206531265, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 14023 + }, + { + "epoch": 0.14024, + "grad_norm": 0.9767142603182802, + "learning_rate": 0.003, + "loss": 4.083, + "step": 14024 + }, + { + "epoch": 0.14025, + "grad_norm": 0.8811350534417398, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 14025 + }, + { + "epoch": 0.14026, + "grad_norm": 0.7731178104895816, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 14026 + }, + { + "epoch": 0.14027, + "grad_norm": 0.8121738878256008, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 14027 + }, + { + "epoch": 0.14028, + "grad_norm": 1.0527593861995281, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 14028 + }, + { + "epoch": 0.14029, + "grad_norm": 1.0364354641585516, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 14029 + }, + { + "epoch": 0.1403, + "grad_norm": 0.9356382728995162, + "learning_rate": 0.003, + "loss": 4.071, + "step": 14030 + }, + { + "epoch": 0.14031, + "grad_norm": 0.9734215622713746, + "learning_rate": 0.003, + "loss": 4.062, + "step": 14031 + }, + { + "epoch": 0.14032, + "grad_norm": 0.9221041776060591, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 14032 + }, + { + "epoch": 0.14033, + "grad_norm": 0.942392631432111, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 14033 + }, + { + "epoch": 0.14034, + "grad_norm": 0.953675799144503, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 14034 + }, + { + "epoch": 0.14035, + "grad_norm": 1.0798346096269986, + "learning_rate": 0.003, + "loss": 4.1119, + "step": 14035 + }, + { + "epoch": 0.14036, + "grad_norm": 0.768493447080825, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 14036 + }, + { + "epoch": 0.14037, + "grad_norm": 0.6658366423896053, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 14037 + }, + { + "epoch": 0.14038, + "grad_norm": 0.6700488420832278, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 14038 + }, + { + "epoch": 0.14039, + "grad_norm": 0.6994751424346235, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 14039 + }, + { + "epoch": 0.1404, + "grad_norm": 0.6805216234807403, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 14040 + }, + { + "epoch": 0.14041, + "grad_norm": 0.7096213198173723, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 14041 + }, + { + "epoch": 0.14042, + "grad_norm": 0.9024095684360148, + "learning_rate": 0.003, + "loss": 4.088, + "step": 14042 + }, + { + "epoch": 0.14043, + "grad_norm": 1.004987005227543, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 14043 + }, + { + "epoch": 0.14044, + "grad_norm": 0.8703713622281967, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 14044 + }, + { + "epoch": 0.14045, + "grad_norm": 0.8162368451407708, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 14045 + }, + { + "epoch": 0.14046, + "grad_norm": 0.822083556580899, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 14046 + }, + { + "epoch": 0.14047, + "grad_norm": 1.0099070720931629, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 14047 + }, + { + "epoch": 0.14048, + "grad_norm": 1.2234046385348645, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 14048 + }, + { + "epoch": 0.14049, + "grad_norm": 1.0961981256367888, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 14049 + }, + { + "epoch": 0.1405, + "grad_norm": 0.9805499525283211, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 14050 + }, + { + "epoch": 0.14051, + "grad_norm": 0.8895750604840732, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 14051 + }, + { + "epoch": 0.14052, + "grad_norm": 0.7226459377749787, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 14052 + }, + { + "epoch": 0.14053, + "grad_norm": 0.6821221410680418, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 14053 + }, + { + "epoch": 0.14054, + "grad_norm": 0.6815291345070602, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 14054 + }, + { + "epoch": 0.14055, + "grad_norm": 0.6225126205146272, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 14055 + }, + { + "epoch": 0.14056, + "grad_norm": 0.5848437124738132, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 14056 + }, + { + "epoch": 0.14057, + "grad_norm": 0.6192948262227931, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 14057 + }, + { + "epoch": 0.14058, + "grad_norm": 0.6817486571460282, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 14058 + }, + { + "epoch": 0.14059, + "grad_norm": 0.8923821241111431, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 14059 + }, + { + "epoch": 0.1406, + "grad_norm": 1.0556781965538722, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 14060 + }, + { + "epoch": 0.14061, + "grad_norm": 0.9167875500653558, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 14061 + }, + { + "epoch": 0.14062, + "grad_norm": 0.7317744511706858, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 14062 + }, + { + "epoch": 0.14063, + "grad_norm": 0.6455136777817637, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 14063 + }, + { + "epoch": 0.14064, + "grad_norm": 0.611610992771704, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 14064 + }, + { + "epoch": 0.14065, + "grad_norm": 0.6983464827194192, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 14065 + }, + { + "epoch": 0.14066, + "grad_norm": 0.7025104082890304, + "learning_rate": 0.003, + "loss": 4.064, + "step": 14066 + }, + { + "epoch": 0.14067, + "grad_norm": 0.7056840136314639, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 14067 + }, + { + "epoch": 0.14068, + "grad_norm": 0.7171403191106246, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 14068 + }, + { + "epoch": 0.14069, + "grad_norm": 0.7964383532066432, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 14069 + }, + { + "epoch": 0.1407, + "grad_norm": 0.992490117489188, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 14070 + }, + { + "epoch": 0.14071, + "grad_norm": 1.1888031121284164, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 14071 + }, + { + "epoch": 0.14072, + "grad_norm": 0.7550982764018704, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 14072 + }, + { + "epoch": 0.14073, + "grad_norm": 0.7178006563476692, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 14073 + }, + { + "epoch": 0.14074, + "grad_norm": 0.6979945759359226, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 14074 + }, + { + "epoch": 0.14075, + "grad_norm": 0.7517115145725634, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 14075 + }, + { + "epoch": 0.14076, + "grad_norm": 0.8971750250500409, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 14076 + }, + { + "epoch": 0.14077, + "grad_norm": 1.0061786676270656, + "learning_rate": 0.003, + "loss": 4.064, + "step": 14077 + }, + { + "epoch": 0.14078, + "grad_norm": 1.0662018235332384, + "learning_rate": 0.003, + "loss": 4.075, + "step": 14078 + }, + { + "epoch": 0.14079, + "grad_norm": 1.001016966281043, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 14079 + }, + { + "epoch": 0.1408, + "grad_norm": 1.0905427139186263, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 14080 + }, + { + "epoch": 0.14081, + "grad_norm": 0.8601233047418335, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 14081 + }, + { + "epoch": 0.14082, + "grad_norm": 0.8464053360349869, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 14082 + }, + { + "epoch": 0.14083, + "grad_norm": 0.8163832757468629, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 14083 + }, + { + "epoch": 0.14084, + "grad_norm": 0.8979789297652752, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 14084 + }, + { + "epoch": 0.14085, + "grad_norm": 0.9452687142481188, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 14085 + }, + { + "epoch": 0.14086, + "grad_norm": 1.2115870138728335, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 14086 + }, + { + "epoch": 0.14087, + "grad_norm": 0.8309684268595496, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 14087 + }, + { + "epoch": 0.14088, + "grad_norm": 0.6117436484071088, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 14088 + }, + { + "epoch": 0.14089, + "grad_norm": 0.7118110256820298, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 14089 + }, + { + "epoch": 0.1409, + "grad_norm": 0.8719998742617723, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 14090 + }, + { + "epoch": 0.14091, + "grad_norm": 1.0052133984711733, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 14091 + }, + { + "epoch": 0.14092, + "grad_norm": 0.9740470677250189, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 14092 + }, + { + "epoch": 0.14093, + "grad_norm": 1.0051493119311377, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 14093 + }, + { + "epoch": 0.14094, + "grad_norm": 0.9458928117251939, + "learning_rate": 0.003, + "loss": 4.046, + "step": 14094 + }, + { + "epoch": 0.14095, + "grad_norm": 0.975823588806917, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 14095 + }, + { + "epoch": 0.14096, + "grad_norm": 0.9906952928170704, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 14096 + }, + { + "epoch": 0.14097, + "grad_norm": 0.8720130556605404, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 14097 + }, + { + "epoch": 0.14098, + "grad_norm": 0.7929794084939011, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 14098 + }, + { + "epoch": 0.14099, + "grad_norm": 0.7608051857301364, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 14099 + }, + { + "epoch": 0.141, + "grad_norm": 0.6072227619693424, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 14100 + }, + { + "epoch": 0.14101, + "grad_norm": 0.6354583908656244, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 14101 + }, + { + "epoch": 0.14102, + "grad_norm": 0.7648589924969916, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 14102 + }, + { + "epoch": 0.14103, + "grad_norm": 0.75073125925008, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 14103 + }, + { + "epoch": 0.14104, + "grad_norm": 0.8944629521861538, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 14104 + }, + { + "epoch": 0.14105, + "grad_norm": 1.1252243979552374, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 14105 + }, + { + "epoch": 0.14106, + "grad_norm": 0.9598877414388742, + "learning_rate": 0.003, + "loss": 4.072, + "step": 14106 + }, + { + "epoch": 0.14107, + "grad_norm": 0.8757505916312833, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 14107 + }, + { + "epoch": 0.14108, + "grad_norm": 0.8919262266661061, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 14108 + }, + { + "epoch": 0.14109, + "grad_norm": 0.863228884529217, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 14109 + }, + { + "epoch": 0.1411, + "grad_norm": 0.905161408561225, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 14110 + }, + { + "epoch": 0.14111, + "grad_norm": 0.8490438955584823, + "learning_rate": 0.003, + "loss": 4.085, + "step": 14111 + }, + { + "epoch": 0.14112, + "grad_norm": 0.8766583441383862, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 14112 + }, + { + "epoch": 0.14113, + "grad_norm": 0.8638268884569851, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 14113 + }, + { + "epoch": 0.14114, + "grad_norm": 0.8155943482946549, + "learning_rate": 0.003, + "loss": 4.076, + "step": 14114 + }, + { + "epoch": 0.14115, + "grad_norm": 0.8161826207286162, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 14115 + }, + { + "epoch": 0.14116, + "grad_norm": 0.8791333392131072, + "learning_rate": 0.003, + "loss": 4.077, + "step": 14116 + }, + { + "epoch": 0.14117, + "grad_norm": 1.0696105564074323, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 14117 + }, + { + "epoch": 0.14118, + "grad_norm": 0.8707842677857357, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 14118 + }, + { + "epoch": 0.14119, + "grad_norm": 0.7663916898824941, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 14119 + }, + { + "epoch": 0.1412, + "grad_norm": 0.6122112788474803, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 14120 + }, + { + "epoch": 0.14121, + "grad_norm": 0.6259906539234324, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 14121 + }, + { + "epoch": 0.14122, + "grad_norm": 0.7133645033810277, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 14122 + }, + { + "epoch": 0.14123, + "grad_norm": 0.781718056704064, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 14123 + }, + { + "epoch": 0.14124, + "grad_norm": 0.8639987865513068, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 14124 + }, + { + "epoch": 0.14125, + "grad_norm": 1.0610918576022226, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 14125 + }, + { + "epoch": 0.14126, + "grad_norm": 1.1447163595017156, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 14126 + }, + { + "epoch": 0.14127, + "grad_norm": 0.8163293324630697, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 14127 + }, + { + "epoch": 0.14128, + "grad_norm": 0.7343315992884264, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 14128 + }, + { + "epoch": 0.14129, + "grad_norm": 0.7480710336682936, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 14129 + }, + { + "epoch": 0.1413, + "grad_norm": 0.7368728548340789, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 14130 + }, + { + "epoch": 0.14131, + "grad_norm": 0.9105225962776399, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 14131 + }, + { + "epoch": 0.14132, + "grad_norm": 1.1385488373812027, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 14132 + }, + { + "epoch": 0.14133, + "grad_norm": 0.913119046010273, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 14133 + }, + { + "epoch": 0.14134, + "grad_norm": 0.7657609624981269, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 14134 + }, + { + "epoch": 0.14135, + "grad_norm": 0.7609010398091507, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 14135 + }, + { + "epoch": 0.14136, + "grad_norm": 0.8340366546380981, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 14136 + }, + { + "epoch": 0.14137, + "grad_norm": 0.87974034458899, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 14137 + }, + { + "epoch": 0.14138, + "grad_norm": 0.9060336588360559, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 14138 + }, + { + "epoch": 0.14139, + "grad_norm": 0.9785030316979524, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 14139 + }, + { + "epoch": 0.1414, + "grad_norm": 1.0251792333533782, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 14140 + }, + { + "epoch": 0.14141, + "grad_norm": 1.0165317489503691, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 14141 + }, + { + "epoch": 0.14142, + "grad_norm": 1.1538977045900214, + "learning_rate": 0.003, + "loss": 4.073, + "step": 14142 + }, + { + "epoch": 0.14143, + "grad_norm": 1.0788233762164792, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 14143 + }, + { + "epoch": 0.14144, + "grad_norm": 0.9010833889002156, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 14144 + }, + { + "epoch": 0.14145, + "grad_norm": 0.9613137075857177, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 14145 + }, + { + "epoch": 0.14146, + "grad_norm": 1.0433242475261193, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 14146 + }, + { + "epoch": 0.14147, + "grad_norm": 1.0346052233749885, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 14147 + }, + { + "epoch": 0.14148, + "grad_norm": 1.0047935776327652, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 14148 + }, + { + "epoch": 0.14149, + "grad_norm": 1.0070191874423993, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 14149 + }, + { + "epoch": 0.1415, + "grad_norm": 0.9403234730370085, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 14150 + }, + { + "epoch": 0.14151, + "grad_norm": 0.9340113204611356, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 14151 + }, + { + "epoch": 0.14152, + "grad_norm": 0.9095788780203912, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 14152 + }, + { + "epoch": 0.14153, + "grad_norm": 0.9235734595743118, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 14153 + }, + { + "epoch": 0.14154, + "grad_norm": 0.8878509247876392, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 14154 + }, + { + "epoch": 0.14155, + "grad_norm": 0.8436709962119316, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 14155 + }, + { + "epoch": 0.14156, + "grad_norm": 0.7725028756807933, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 14156 + }, + { + "epoch": 0.14157, + "grad_norm": 0.7607021345434494, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 14157 + }, + { + "epoch": 0.14158, + "grad_norm": 0.7837363952161858, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 14158 + }, + { + "epoch": 0.14159, + "grad_norm": 0.7377747534018623, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 14159 + }, + { + "epoch": 0.1416, + "grad_norm": 0.8685123610464288, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 14160 + }, + { + "epoch": 0.14161, + "grad_norm": 1.05678529283597, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 14161 + }, + { + "epoch": 0.14162, + "grad_norm": 1.3045732595480373, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 14162 + }, + { + "epoch": 0.14163, + "grad_norm": 0.6138736597121288, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 14163 + }, + { + "epoch": 0.14164, + "grad_norm": 0.6664174437649203, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 14164 + }, + { + "epoch": 0.14165, + "grad_norm": 0.7351929638196077, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 14165 + }, + { + "epoch": 0.14166, + "grad_norm": 0.6837082944117285, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 14166 + }, + { + "epoch": 0.14167, + "grad_norm": 0.6485593388343944, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 14167 + }, + { + "epoch": 0.14168, + "grad_norm": 0.6502219497216053, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 14168 + }, + { + "epoch": 0.14169, + "grad_norm": 0.6530460950303256, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 14169 + }, + { + "epoch": 0.1417, + "grad_norm": 0.5314371458766793, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 14170 + }, + { + "epoch": 0.14171, + "grad_norm": 0.5397748506892242, + "learning_rate": 0.003, + "loss": 4.07, + "step": 14171 + }, + { + "epoch": 0.14172, + "grad_norm": 0.5907464608832607, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 14172 + }, + { + "epoch": 0.14173, + "grad_norm": 0.5760272288509668, + "learning_rate": 0.003, + "loss": 4.071, + "step": 14173 + }, + { + "epoch": 0.14174, + "grad_norm": 0.7023072638492589, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 14174 + }, + { + "epoch": 0.14175, + "grad_norm": 1.0348162640920575, + "learning_rate": 0.003, + "loss": 4.086, + "step": 14175 + }, + { + "epoch": 0.14176, + "grad_norm": 1.4191969408141354, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 14176 + }, + { + "epoch": 0.14177, + "grad_norm": 0.5762605622983992, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 14177 + }, + { + "epoch": 0.14178, + "grad_norm": 0.8349905059365021, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 14178 + }, + { + "epoch": 0.14179, + "grad_norm": 1.0238856329640418, + "learning_rate": 0.003, + "loss": 4.083, + "step": 14179 + }, + { + "epoch": 0.1418, + "grad_norm": 0.8958196113982968, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 14180 + }, + { + "epoch": 0.14181, + "grad_norm": 0.8488290140329743, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 14181 + }, + { + "epoch": 0.14182, + "grad_norm": 0.85594951057921, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 14182 + }, + { + "epoch": 0.14183, + "grad_norm": 0.8445824988413334, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 14183 + }, + { + "epoch": 0.14184, + "grad_norm": 1.0823833226303805, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 14184 + }, + { + "epoch": 0.14185, + "grad_norm": 1.2134570167918746, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 14185 + }, + { + "epoch": 0.14186, + "grad_norm": 0.8395744683994786, + "learning_rate": 0.003, + "loss": 4.074, + "step": 14186 + }, + { + "epoch": 0.14187, + "grad_norm": 0.736679291662909, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 14187 + }, + { + "epoch": 0.14188, + "grad_norm": 0.8722552700424218, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 14188 + }, + { + "epoch": 0.14189, + "grad_norm": 0.7933914534797809, + "learning_rate": 0.003, + "loss": 4.065, + "step": 14189 + }, + { + "epoch": 0.1419, + "grad_norm": 0.7511860515790721, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 14190 + }, + { + "epoch": 0.14191, + "grad_norm": 0.7577322350020178, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 14191 + }, + { + "epoch": 0.14192, + "grad_norm": 0.7852520407334463, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 14192 + }, + { + "epoch": 0.14193, + "grad_norm": 0.8740960536673468, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 14193 + }, + { + "epoch": 0.14194, + "grad_norm": 0.9454865942734588, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 14194 + }, + { + "epoch": 0.14195, + "grad_norm": 1.0260824379078455, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 14195 + }, + { + "epoch": 0.14196, + "grad_norm": 1.128710367507716, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 14196 + }, + { + "epoch": 0.14197, + "grad_norm": 0.8702706333594432, + "learning_rate": 0.003, + "loss": 4.075, + "step": 14197 + }, + { + "epoch": 0.14198, + "grad_norm": 0.855801841251651, + "learning_rate": 0.003, + "loss": 4.074, + "step": 14198 + }, + { + "epoch": 0.14199, + "grad_norm": 0.79570611148028, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 14199 + }, + { + "epoch": 0.142, + "grad_norm": 0.7762153897839794, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 14200 + }, + { + "epoch": 0.14201, + "grad_norm": 0.8313138130976775, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 14201 + }, + { + "epoch": 0.14202, + "grad_norm": 0.9102058475281254, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 14202 + }, + { + "epoch": 0.14203, + "grad_norm": 0.9879444850830352, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 14203 + }, + { + "epoch": 0.14204, + "grad_norm": 0.9727375382419631, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 14204 + }, + { + "epoch": 0.14205, + "grad_norm": 1.072343905396286, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 14205 + }, + { + "epoch": 0.14206, + "grad_norm": 1.0918392136913837, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 14206 + }, + { + "epoch": 0.14207, + "grad_norm": 0.9079756520513552, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 14207 + }, + { + "epoch": 0.14208, + "grad_norm": 1.0456318462186016, + "learning_rate": 0.003, + "loss": 4.068, + "step": 14208 + }, + { + "epoch": 0.14209, + "grad_norm": 0.9103258186594584, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 14209 + }, + { + "epoch": 0.1421, + "grad_norm": 0.9195259346016901, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 14210 + }, + { + "epoch": 0.14211, + "grad_norm": 0.8677025003910135, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 14211 + }, + { + "epoch": 0.14212, + "grad_norm": 0.7573509185278031, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 14212 + }, + { + "epoch": 0.14213, + "grad_norm": 0.7953171501146511, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 14213 + }, + { + "epoch": 0.14214, + "grad_norm": 0.8858347423608933, + "learning_rate": 0.003, + "loss": 4.074, + "step": 14214 + }, + { + "epoch": 0.14215, + "grad_norm": 1.1031124908567087, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 14215 + }, + { + "epoch": 0.14216, + "grad_norm": 1.131079615414483, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 14216 + }, + { + "epoch": 0.14217, + "grad_norm": 0.8741853018247859, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 14217 + }, + { + "epoch": 0.14218, + "grad_norm": 0.7734197898654148, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 14218 + }, + { + "epoch": 0.14219, + "grad_norm": 0.8727939958879936, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 14219 + }, + { + "epoch": 0.1422, + "grad_norm": 0.888259714477054, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 14220 + }, + { + "epoch": 0.14221, + "grad_norm": 0.8408869079351118, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 14221 + }, + { + "epoch": 0.14222, + "grad_norm": 0.9160714951657148, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 14222 + }, + { + "epoch": 0.14223, + "grad_norm": 0.8923006023358101, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 14223 + }, + { + "epoch": 0.14224, + "grad_norm": 0.7541129078942927, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 14224 + }, + { + "epoch": 0.14225, + "grad_norm": 0.7017399774791405, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 14225 + }, + { + "epoch": 0.14226, + "grad_norm": 0.7175844800580948, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 14226 + }, + { + "epoch": 0.14227, + "grad_norm": 0.8061323182082613, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 14227 + }, + { + "epoch": 0.14228, + "grad_norm": 0.8549019882180127, + "learning_rate": 0.003, + "loss": 4.058, + "step": 14228 + }, + { + "epoch": 0.14229, + "grad_norm": 0.7618872119689732, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 14229 + }, + { + "epoch": 0.1423, + "grad_norm": 0.748617160877437, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 14230 + }, + { + "epoch": 0.14231, + "grad_norm": 0.8537449120706244, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 14231 + }, + { + "epoch": 0.14232, + "grad_norm": 0.9538841898644234, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 14232 + }, + { + "epoch": 0.14233, + "grad_norm": 1.2323470714454452, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 14233 + }, + { + "epoch": 0.14234, + "grad_norm": 0.9252380311555533, + "learning_rate": 0.003, + "loss": 4.054, + "step": 14234 + }, + { + "epoch": 0.14235, + "grad_norm": 0.8357527733858238, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 14235 + }, + { + "epoch": 0.14236, + "grad_norm": 0.7383299511457709, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 14236 + }, + { + "epoch": 0.14237, + "grad_norm": 0.7148127127044163, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 14237 + }, + { + "epoch": 0.14238, + "grad_norm": 0.6575998983357735, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 14238 + }, + { + "epoch": 0.14239, + "grad_norm": 0.7224088928415203, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 14239 + }, + { + "epoch": 0.1424, + "grad_norm": 0.7258406529257267, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 14240 + }, + { + "epoch": 0.14241, + "grad_norm": 0.6700010559315331, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 14241 + }, + { + "epoch": 0.14242, + "grad_norm": 0.7034472369150453, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 14242 + }, + { + "epoch": 0.14243, + "grad_norm": 0.9655966873187218, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 14243 + }, + { + "epoch": 0.14244, + "grad_norm": 1.3492244069241692, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 14244 + }, + { + "epoch": 0.14245, + "grad_norm": 0.787432760836881, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 14245 + }, + { + "epoch": 0.14246, + "grad_norm": 0.6589245508435077, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 14246 + }, + { + "epoch": 0.14247, + "grad_norm": 0.7176742670893751, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 14247 + }, + { + "epoch": 0.14248, + "grad_norm": 0.7004591529124673, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 14248 + }, + { + "epoch": 0.14249, + "grad_norm": 0.7872892677745946, + "learning_rate": 0.003, + "loss": 4.024, + "step": 14249 + }, + { + "epoch": 0.1425, + "grad_norm": 0.9122621574614087, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 14250 + }, + { + "epoch": 0.14251, + "grad_norm": 1.0861952597559, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 14251 + }, + { + "epoch": 0.14252, + "grad_norm": 0.8040420734689884, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 14252 + }, + { + "epoch": 0.14253, + "grad_norm": 0.673228848158347, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 14253 + }, + { + "epoch": 0.14254, + "grad_norm": 0.6178610509082365, + "learning_rate": 0.003, + "loss": 4.054, + "step": 14254 + }, + { + "epoch": 0.14255, + "grad_norm": 0.7803330205829802, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 14255 + }, + { + "epoch": 0.14256, + "grad_norm": 0.8507086918819735, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 14256 + }, + { + "epoch": 0.14257, + "grad_norm": 0.837260019958009, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 14257 + }, + { + "epoch": 0.14258, + "grad_norm": 0.9369770791697932, + "learning_rate": 0.003, + "loss": 4.054, + "step": 14258 + }, + { + "epoch": 0.14259, + "grad_norm": 1.1719760621145092, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 14259 + }, + { + "epoch": 0.1426, + "grad_norm": 1.0047106805433939, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 14260 + }, + { + "epoch": 0.14261, + "grad_norm": 1.0201426190948089, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 14261 + }, + { + "epoch": 0.14262, + "grad_norm": 0.8794796375528614, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 14262 + }, + { + "epoch": 0.14263, + "grad_norm": 0.8599179547945105, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 14263 + }, + { + "epoch": 0.14264, + "grad_norm": 0.853001400341203, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 14264 + }, + { + "epoch": 0.14265, + "grad_norm": 0.9796941134897704, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 14265 + }, + { + "epoch": 0.14266, + "grad_norm": 1.143809317255042, + "learning_rate": 0.003, + "loss": 4.098, + "step": 14266 + }, + { + "epoch": 0.14267, + "grad_norm": 1.0123655465496928, + "learning_rate": 0.003, + "loss": 4.096, + "step": 14267 + }, + { + "epoch": 0.14268, + "grad_norm": 1.156157055455667, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 14268 + }, + { + "epoch": 0.14269, + "grad_norm": 0.9417857169223783, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 14269 + }, + { + "epoch": 0.1427, + "grad_norm": 1.002625041902675, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 14270 + }, + { + "epoch": 0.14271, + "grad_norm": 1.1455257576736626, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 14271 + }, + { + "epoch": 0.14272, + "grad_norm": 0.9169999227937337, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 14272 + }, + { + "epoch": 0.14273, + "grad_norm": 1.0288608407310105, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 14273 + }, + { + "epoch": 0.14274, + "grad_norm": 0.9895127236305712, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 14274 + }, + { + "epoch": 0.14275, + "grad_norm": 1.024068444256658, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 14275 + }, + { + "epoch": 0.14276, + "grad_norm": 1.0358327615667164, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 14276 + }, + { + "epoch": 0.14277, + "grad_norm": 1.0995848603797826, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 14277 + }, + { + "epoch": 0.14278, + "grad_norm": 0.9710395555199974, + "learning_rate": 0.003, + "loss": 4.067, + "step": 14278 + }, + { + "epoch": 0.14279, + "grad_norm": 0.7935574068237154, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 14279 + }, + { + "epoch": 0.1428, + "grad_norm": 0.7440767417035967, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 14280 + }, + { + "epoch": 0.14281, + "grad_norm": 0.7249925095996536, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 14281 + }, + { + "epoch": 0.14282, + "grad_norm": 0.7738584216331064, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 14282 + }, + { + "epoch": 0.14283, + "grad_norm": 0.8075419416152382, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 14283 + }, + { + "epoch": 0.14284, + "grad_norm": 0.808656055655579, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 14284 + }, + { + "epoch": 0.14285, + "grad_norm": 0.7434415777020318, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 14285 + }, + { + "epoch": 0.14286, + "grad_norm": 0.782537850481025, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 14286 + }, + { + "epoch": 0.14287, + "grad_norm": 0.7398803590584546, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 14287 + }, + { + "epoch": 0.14288, + "grad_norm": 0.6498718731345486, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 14288 + }, + { + "epoch": 0.14289, + "grad_norm": 0.57167150621938, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 14289 + }, + { + "epoch": 0.1429, + "grad_norm": 0.629917071736773, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 14290 + }, + { + "epoch": 0.14291, + "grad_norm": 0.7282520539482328, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 14291 + }, + { + "epoch": 0.14292, + "grad_norm": 0.8448216364800304, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 14292 + }, + { + "epoch": 0.14293, + "grad_norm": 0.9898291474286086, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 14293 + }, + { + "epoch": 0.14294, + "grad_norm": 1.384130185668138, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 14294 + }, + { + "epoch": 0.14295, + "grad_norm": 0.8132712159924654, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 14295 + }, + { + "epoch": 0.14296, + "grad_norm": 0.5878901615618249, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 14296 + }, + { + "epoch": 0.14297, + "grad_norm": 0.5235229402678793, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 14297 + }, + { + "epoch": 0.14298, + "grad_norm": 0.5940769162928273, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 14298 + }, + { + "epoch": 0.14299, + "grad_norm": 0.635954759733811, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 14299 + }, + { + "epoch": 0.143, + "grad_norm": 0.7604566779202656, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 14300 + }, + { + "epoch": 0.14301, + "grad_norm": 0.9621812171223754, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 14301 + }, + { + "epoch": 0.14302, + "grad_norm": 1.1332426046276227, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 14302 + }, + { + "epoch": 0.14303, + "grad_norm": 0.8826723697906451, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 14303 + }, + { + "epoch": 0.14304, + "grad_norm": 0.8146302717388536, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 14304 + }, + { + "epoch": 0.14305, + "grad_norm": 0.8862814720311086, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 14305 + }, + { + "epoch": 0.14306, + "grad_norm": 1.052781435999485, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 14306 + }, + { + "epoch": 0.14307, + "grad_norm": 0.9466303101454425, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 14307 + }, + { + "epoch": 0.14308, + "grad_norm": 0.9569390726669585, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 14308 + }, + { + "epoch": 0.14309, + "grad_norm": 0.9767673438310612, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 14309 + }, + { + "epoch": 0.1431, + "grad_norm": 1.2068390135277705, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 14310 + }, + { + "epoch": 0.14311, + "grad_norm": 0.9118515998023901, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 14311 + }, + { + "epoch": 0.14312, + "grad_norm": 0.9234441036395075, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 14312 + }, + { + "epoch": 0.14313, + "grad_norm": 0.9379283578197763, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 14313 + }, + { + "epoch": 0.14314, + "grad_norm": 1.0481055206862089, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 14314 + }, + { + "epoch": 0.14315, + "grad_norm": 1.0170438017287065, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 14315 + }, + { + "epoch": 0.14316, + "grad_norm": 1.1314036767196796, + "learning_rate": 0.003, + "loss": 4.075, + "step": 14316 + }, + { + "epoch": 0.14317, + "grad_norm": 0.9667708712613111, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 14317 + }, + { + "epoch": 0.14318, + "grad_norm": 1.0540721435901028, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 14318 + }, + { + "epoch": 0.14319, + "grad_norm": 0.8125763824735159, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 14319 + }, + { + "epoch": 0.1432, + "grad_norm": 0.6690389583956455, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 14320 + }, + { + "epoch": 0.14321, + "grad_norm": 0.715576186711564, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 14321 + }, + { + "epoch": 0.14322, + "grad_norm": 0.7034013270653181, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 14322 + }, + { + "epoch": 0.14323, + "grad_norm": 0.6650656285716945, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 14323 + }, + { + "epoch": 0.14324, + "grad_norm": 0.8507560154157894, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 14324 + }, + { + "epoch": 0.14325, + "grad_norm": 0.893132375990626, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 14325 + }, + { + "epoch": 0.14326, + "grad_norm": 0.8316403162519251, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 14326 + }, + { + "epoch": 0.14327, + "grad_norm": 0.8211797982823278, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 14327 + }, + { + "epoch": 0.14328, + "grad_norm": 0.854730121681877, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 14328 + }, + { + "epoch": 0.14329, + "grad_norm": 0.8488314778432745, + "learning_rate": 0.003, + "loss": 4.061, + "step": 14329 + }, + { + "epoch": 0.1433, + "grad_norm": 0.8173694929630654, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 14330 + }, + { + "epoch": 0.14331, + "grad_norm": 0.8451422403651959, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 14331 + }, + { + "epoch": 0.14332, + "grad_norm": 0.953725488533161, + "learning_rate": 0.003, + "loss": 4.071, + "step": 14332 + }, + { + "epoch": 0.14333, + "grad_norm": 0.9935817251768986, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 14333 + }, + { + "epoch": 0.14334, + "grad_norm": 1.0069330550136053, + "learning_rate": 0.003, + "loss": 4.077, + "step": 14334 + }, + { + "epoch": 0.14335, + "grad_norm": 1.0128321819662722, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 14335 + }, + { + "epoch": 0.14336, + "grad_norm": 0.9720243567033409, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 14336 + }, + { + "epoch": 0.14337, + "grad_norm": 1.034406502119741, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 14337 + }, + { + "epoch": 0.14338, + "grad_norm": 1.028489631883045, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 14338 + }, + { + "epoch": 0.14339, + "grad_norm": 1.0164498014126644, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 14339 + }, + { + "epoch": 0.1434, + "grad_norm": 0.9007186917592227, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 14340 + }, + { + "epoch": 0.14341, + "grad_norm": 0.9064557694182467, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 14341 + }, + { + "epoch": 0.14342, + "grad_norm": 0.9527853939982664, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 14342 + }, + { + "epoch": 0.14343, + "grad_norm": 0.8255592350256148, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 14343 + }, + { + "epoch": 0.14344, + "grad_norm": 0.6414147580958486, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 14344 + }, + { + "epoch": 0.14345, + "grad_norm": 0.6873869318678593, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 14345 + }, + { + "epoch": 0.14346, + "grad_norm": 0.7330891013138953, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 14346 + }, + { + "epoch": 0.14347, + "grad_norm": 0.8101567143121826, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 14347 + }, + { + "epoch": 0.14348, + "grad_norm": 0.9702199997855077, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 14348 + }, + { + "epoch": 0.14349, + "grad_norm": 0.9904100138960906, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 14349 + }, + { + "epoch": 0.1435, + "grad_norm": 1.1019773919500955, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 14350 + }, + { + "epoch": 0.14351, + "grad_norm": 0.9978511333891973, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 14351 + }, + { + "epoch": 0.14352, + "grad_norm": 1.014835951520754, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 14352 + }, + { + "epoch": 0.14353, + "grad_norm": 0.9486519931027005, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 14353 + }, + { + "epoch": 0.14354, + "grad_norm": 0.8014608748984109, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 14354 + }, + { + "epoch": 0.14355, + "grad_norm": 0.6893169107280758, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 14355 + }, + { + "epoch": 0.14356, + "grad_norm": 0.7790746564890838, + "learning_rate": 0.003, + "loss": 4.068, + "step": 14356 + }, + { + "epoch": 0.14357, + "grad_norm": 0.8571150262837032, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 14357 + }, + { + "epoch": 0.14358, + "grad_norm": 0.9166295189774966, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 14358 + }, + { + "epoch": 0.14359, + "grad_norm": 0.9473566514431786, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 14359 + }, + { + "epoch": 0.1436, + "grad_norm": 0.9837585839604661, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 14360 + }, + { + "epoch": 0.14361, + "grad_norm": 1.0196945016539802, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 14361 + }, + { + "epoch": 0.14362, + "grad_norm": 1.052706176676344, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 14362 + }, + { + "epoch": 0.14363, + "grad_norm": 1.0314646065600368, + "learning_rate": 0.003, + "loss": 4.07, + "step": 14363 + }, + { + "epoch": 0.14364, + "grad_norm": 0.8482256306999743, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 14364 + }, + { + "epoch": 0.14365, + "grad_norm": 0.8388761850218024, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 14365 + }, + { + "epoch": 0.14366, + "grad_norm": 0.7663169047495818, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 14366 + }, + { + "epoch": 0.14367, + "grad_norm": 0.6499443201499312, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 14367 + }, + { + "epoch": 0.14368, + "grad_norm": 0.7728910831473662, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 14368 + }, + { + "epoch": 0.14369, + "grad_norm": 0.9151885531778485, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 14369 + }, + { + "epoch": 0.1437, + "grad_norm": 1.012549276988548, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 14370 + }, + { + "epoch": 0.14371, + "grad_norm": 1.0316331881002632, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 14371 + }, + { + "epoch": 0.14372, + "grad_norm": 0.9309894338228544, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 14372 + }, + { + "epoch": 0.14373, + "grad_norm": 0.8719919389914711, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 14373 + }, + { + "epoch": 0.14374, + "grad_norm": 0.8183268003383068, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 14374 + }, + { + "epoch": 0.14375, + "grad_norm": 0.822230156604052, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 14375 + }, + { + "epoch": 0.14376, + "grad_norm": 0.8210409550089975, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 14376 + }, + { + "epoch": 0.14377, + "grad_norm": 0.7918484286405145, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 14377 + }, + { + "epoch": 0.14378, + "grad_norm": 0.6871245341077445, + "learning_rate": 0.003, + "loss": 4.059, + "step": 14378 + }, + { + "epoch": 0.14379, + "grad_norm": 0.61337738208206, + "learning_rate": 0.003, + "loss": 4.062, + "step": 14379 + }, + { + "epoch": 0.1438, + "grad_norm": 0.7949733746083621, + "learning_rate": 0.003, + "loss": 4.061, + "step": 14380 + }, + { + "epoch": 0.14381, + "grad_norm": 0.7966095482585109, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 14381 + }, + { + "epoch": 0.14382, + "grad_norm": 0.742684410456428, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 14382 + }, + { + "epoch": 0.14383, + "grad_norm": 0.6808568373631283, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 14383 + }, + { + "epoch": 0.14384, + "grad_norm": 0.6144707846148258, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 14384 + }, + { + "epoch": 0.14385, + "grad_norm": 0.6397262884987464, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 14385 + }, + { + "epoch": 0.14386, + "grad_norm": 0.9079018023080869, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 14386 + }, + { + "epoch": 0.14387, + "grad_norm": 1.276655150279495, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 14387 + }, + { + "epoch": 0.14388, + "grad_norm": 0.8802185657108448, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 14388 + }, + { + "epoch": 0.14389, + "grad_norm": 0.7748540022902647, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 14389 + }, + { + "epoch": 0.1439, + "grad_norm": 0.7910642923199699, + "learning_rate": 0.003, + "loss": 4.066, + "step": 14390 + }, + { + "epoch": 0.14391, + "grad_norm": 0.7814914376937365, + "learning_rate": 0.003, + "loss": 4.054, + "step": 14391 + }, + { + "epoch": 0.14392, + "grad_norm": 0.711033803024645, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 14392 + }, + { + "epoch": 0.14393, + "grad_norm": 0.6689814772603192, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 14393 + }, + { + "epoch": 0.14394, + "grad_norm": 0.7743410260570905, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 14394 + }, + { + "epoch": 0.14395, + "grad_norm": 0.8267055962453908, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 14395 + }, + { + "epoch": 0.14396, + "grad_norm": 0.8280054805253843, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 14396 + }, + { + "epoch": 0.14397, + "grad_norm": 0.8410900205580703, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 14397 + }, + { + "epoch": 0.14398, + "grad_norm": 0.74892299183487, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 14398 + }, + { + "epoch": 0.14399, + "grad_norm": 0.7541148933857669, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 14399 + }, + { + "epoch": 0.144, + "grad_norm": 0.7203519769604699, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 14400 + }, + { + "epoch": 0.14401, + "grad_norm": 0.7863202506879009, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 14401 + }, + { + "epoch": 0.14402, + "grad_norm": 0.8304267299415876, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 14402 + }, + { + "epoch": 0.14403, + "grad_norm": 1.0910369478178739, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 14403 + }, + { + "epoch": 0.14404, + "grad_norm": 1.1771650605747097, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 14404 + }, + { + "epoch": 0.14405, + "grad_norm": 0.892076820447087, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 14405 + }, + { + "epoch": 0.14406, + "grad_norm": 0.8772684958664766, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 14406 + }, + { + "epoch": 0.14407, + "grad_norm": 0.9973513358644183, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 14407 + }, + { + "epoch": 0.14408, + "grad_norm": 1.1490641624873017, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 14408 + }, + { + "epoch": 0.14409, + "grad_norm": 0.815542294684307, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 14409 + }, + { + "epoch": 0.1441, + "grad_norm": 0.8081892759281243, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 14410 + }, + { + "epoch": 0.14411, + "grad_norm": 0.9192651921903972, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 14411 + }, + { + "epoch": 0.14412, + "grad_norm": 0.8645285669631384, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 14412 + }, + { + "epoch": 0.14413, + "grad_norm": 0.8281700030855166, + "learning_rate": 0.003, + "loss": 4.084, + "step": 14413 + }, + { + "epoch": 0.14414, + "grad_norm": 1.0239009006094857, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 14414 + }, + { + "epoch": 0.14415, + "grad_norm": 1.0868552660199027, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 14415 + }, + { + "epoch": 0.14416, + "grad_norm": 1.0161286840519521, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 14416 + }, + { + "epoch": 0.14417, + "grad_norm": 1.174833107721702, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 14417 + }, + { + "epoch": 0.14418, + "grad_norm": 1.091906753190101, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 14418 + }, + { + "epoch": 0.14419, + "grad_norm": 0.9390134669254883, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 14419 + }, + { + "epoch": 0.1442, + "grad_norm": 0.905316492652339, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 14420 + }, + { + "epoch": 0.14421, + "grad_norm": 0.8973630440941875, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 14421 + }, + { + "epoch": 0.14422, + "grad_norm": 0.8593800792454128, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 14422 + }, + { + "epoch": 0.14423, + "grad_norm": 0.9097836307265622, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 14423 + }, + { + "epoch": 0.14424, + "grad_norm": 0.8769136095747978, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 14424 + }, + { + "epoch": 0.14425, + "grad_norm": 0.8781004531945429, + "learning_rate": 0.003, + "loss": 4.052, + "step": 14425 + }, + { + "epoch": 0.14426, + "grad_norm": 0.8523510008630767, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 14426 + }, + { + "epoch": 0.14427, + "grad_norm": 0.7697966118866878, + "learning_rate": 0.003, + "loss": 4.08, + "step": 14427 + }, + { + "epoch": 0.14428, + "grad_norm": 0.6659967765645805, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 14428 + }, + { + "epoch": 0.14429, + "grad_norm": 0.7283987451534251, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 14429 + }, + { + "epoch": 0.1443, + "grad_norm": 0.6524668161944647, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 14430 + }, + { + "epoch": 0.14431, + "grad_norm": 0.6577762341755574, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 14431 + }, + { + "epoch": 0.14432, + "grad_norm": 0.7670158304520253, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 14432 + }, + { + "epoch": 0.14433, + "grad_norm": 0.8497958660369926, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 14433 + }, + { + "epoch": 0.14434, + "grad_norm": 1.0063921227645782, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 14434 + }, + { + "epoch": 0.14435, + "grad_norm": 1.232546830404364, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 14435 + }, + { + "epoch": 0.14436, + "grad_norm": 0.7803306088988813, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 14436 + }, + { + "epoch": 0.14437, + "grad_norm": 0.6385659966701883, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 14437 + }, + { + "epoch": 0.14438, + "grad_norm": 0.7387603247427053, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 14438 + }, + { + "epoch": 0.14439, + "grad_norm": 0.8775053705168816, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 14439 + }, + { + "epoch": 0.1444, + "grad_norm": 0.9734258283263654, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 14440 + }, + { + "epoch": 0.14441, + "grad_norm": 0.9705028390071423, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 14441 + }, + { + "epoch": 0.14442, + "grad_norm": 0.9960062972690155, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 14442 + }, + { + "epoch": 0.14443, + "grad_norm": 0.8818896239814513, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 14443 + }, + { + "epoch": 0.14444, + "grad_norm": 0.7289286765268309, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 14444 + }, + { + "epoch": 0.14445, + "grad_norm": 0.7740432225462396, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 14445 + }, + { + "epoch": 0.14446, + "grad_norm": 0.9291240503602122, + "learning_rate": 0.003, + "loss": 4.057, + "step": 14446 + }, + { + "epoch": 0.14447, + "grad_norm": 1.1489522686322768, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 14447 + }, + { + "epoch": 0.14448, + "grad_norm": 0.9043742838406463, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 14448 + }, + { + "epoch": 0.14449, + "grad_norm": 0.8886525100096326, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 14449 + }, + { + "epoch": 0.1445, + "grad_norm": 0.9442135755576159, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 14450 + }, + { + "epoch": 0.14451, + "grad_norm": 0.88853199038061, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 14451 + }, + { + "epoch": 0.14452, + "grad_norm": 0.9187092218973602, + "learning_rate": 0.003, + "loss": 4.066, + "step": 14452 + }, + { + "epoch": 0.14453, + "grad_norm": 0.8451833134059921, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 14453 + }, + { + "epoch": 0.14454, + "grad_norm": 0.8513118210882025, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 14454 + }, + { + "epoch": 0.14455, + "grad_norm": 0.7722749130937169, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 14455 + }, + { + "epoch": 0.14456, + "grad_norm": 0.7169890500245386, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 14456 + }, + { + "epoch": 0.14457, + "grad_norm": 0.6697986292702363, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 14457 + }, + { + "epoch": 0.14458, + "grad_norm": 0.7755031570144965, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 14458 + }, + { + "epoch": 0.14459, + "grad_norm": 1.0605824232348093, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 14459 + }, + { + "epoch": 0.1446, + "grad_norm": 1.2293405457221496, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 14460 + }, + { + "epoch": 0.14461, + "grad_norm": 0.7976780762548757, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 14461 + }, + { + "epoch": 0.14462, + "grad_norm": 0.7849320724391274, + "learning_rate": 0.003, + "loss": 4.064, + "step": 14462 + }, + { + "epoch": 0.14463, + "grad_norm": 0.8266960310682302, + "learning_rate": 0.003, + "loss": 4.083, + "step": 14463 + }, + { + "epoch": 0.14464, + "grad_norm": 0.7460753583256715, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 14464 + }, + { + "epoch": 0.14465, + "grad_norm": 0.7821210362976887, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 14465 + }, + { + "epoch": 0.14466, + "grad_norm": 0.8691872038026347, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 14466 + }, + { + "epoch": 0.14467, + "grad_norm": 0.8688669781328593, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 14467 + }, + { + "epoch": 0.14468, + "grad_norm": 1.0157383621401073, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 14468 + }, + { + "epoch": 0.14469, + "grad_norm": 1.0488500564055532, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 14469 + }, + { + "epoch": 0.1447, + "grad_norm": 0.8453306124392075, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 14470 + }, + { + "epoch": 0.14471, + "grad_norm": 0.7496859708795902, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 14471 + }, + { + "epoch": 0.14472, + "grad_norm": 0.7702865369170115, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 14472 + }, + { + "epoch": 0.14473, + "grad_norm": 0.7927025686268351, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 14473 + }, + { + "epoch": 0.14474, + "grad_norm": 0.7028001411167409, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 14474 + }, + { + "epoch": 0.14475, + "grad_norm": 0.6366764775222469, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 14475 + }, + { + "epoch": 0.14476, + "grad_norm": 0.7266403812115433, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 14476 + }, + { + "epoch": 0.14477, + "grad_norm": 0.8285669818610948, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 14477 + }, + { + "epoch": 0.14478, + "grad_norm": 1.0689441553197612, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 14478 + }, + { + "epoch": 0.14479, + "grad_norm": 1.2619968094331953, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 14479 + }, + { + "epoch": 0.1448, + "grad_norm": 0.681802879160255, + "learning_rate": 0.003, + "loss": 4.059, + "step": 14480 + }, + { + "epoch": 0.14481, + "grad_norm": 0.7895871815518442, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 14481 + }, + { + "epoch": 0.14482, + "grad_norm": 0.7162699951289682, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 14482 + }, + { + "epoch": 0.14483, + "grad_norm": 0.7559369806480447, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 14483 + }, + { + "epoch": 0.14484, + "grad_norm": 0.6854003499725579, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 14484 + }, + { + "epoch": 0.14485, + "grad_norm": 0.7261168346272312, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 14485 + }, + { + "epoch": 0.14486, + "grad_norm": 0.7581916674639567, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 14486 + }, + { + "epoch": 0.14487, + "grad_norm": 0.8458017056734358, + "learning_rate": 0.003, + "loss": 4.058, + "step": 14487 + }, + { + "epoch": 0.14488, + "grad_norm": 0.9407237060809456, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 14488 + }, + { + "epoch": 0.14489, + "grad_norm": 1.2838500110985553, + "learning_rate": 0.003, + "loss": 4.068, + "step": 14489 + }, + { + "epoch": 0.1449, + "grad_norm": 0.8253643573774834, + "learning_rate": 0.003, + "loss": 4.053, + "step": 14490 + }, + { + "epoch": 0.14491, + "grad_norm": 0.6833212631082725, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 14491 + }, + { + "epoch": 0.14492, + "grad_norm": 0.5907013243321121, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 14492 + }, + { + "epoch": 0.14493, + "grad_norm": 0.5477081995181327, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 14493 + }, + { + "epoch": 0.14494, + "grad_norm": 0.6138069260250624, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 14494 + }, + { + "epoch": 0.14495, + "grad_norm": 0.6810644475113934, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 14495 + }, + { + "epoch": 0.14496, + "grad_norm": 0.846006442349366, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 14496 + }, + { + "epoch": 0.14497, + "grad_norm": 0.9759374354820222, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 14497 + }, + { + "epoch": 0.14498, + "grad_norm": 1.1631680204402455, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 14498 + }, + { + "epoch": 0.14499, + "grad_norm": 0.8528800368239491, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 14499 + }, + { + "epoch": 0.145, + "grad_norm": 0.8007628472017968, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 14500 + }, + { + "epoch": 0.14501, + "grad_norm": 0.8621911835551398, + "learning_rate": 0.003, + "loss": 4.079, + "step": 14501 + }, + { + "epoch": 0.14502, + "grad_norm": 0.8392413556988816, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 14502 + }, + { + "epoch": 0.14503, + "grad_norm": 0.9062842404881979, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 14503 + }, + { + "epoch": 0.14504, + "grad_norm": 1.0475072658786253, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 14504 + }, + { + "epoch": 0.14505, + "grad_norm": 1.0771198598753233, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 14505 + }, + { + "epoch": 0.14506, + "grad_norm": 0.8522010187012606, + "learning_rate": 0.003, + "loss": 4.056, + "step": 14506 + }, + { + "epoch": 0.14507, + "grad_norm": 0.9266244073098737, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 14507 + }, + { + "epoch": 0.14508, + "grad_norm": 1.0972487087219982, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 14508 + }, + { + "epoch": 0.14509, + "grad_norm": 1.1536680843828024, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 14509 + }, + { + "epoch": 0.1451, + "grad_norm": 1.0259378881237784, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 14510 + }, + { + "epoch": 0.14511, + "grad_norm": 1.0730596033394761, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 14511 + }, + { + "epoch": 0.14512, + "grad_norm": 0.9411542390444215, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 14512 + }, + { + "epoch": 0.14513, + "grad_norm": 0.8994694044524201, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 14513 + }, + { + "epoch": 0.14514, + "grad_norm": 0.8067485770896665, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 14514 + }, + { + "epoch": 0.14515, + "grad_norm": 1.01746158851945, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 14515 + }, + { + "epoch": 0.14516, + "grad_norm": 1.2153894499015556, + "learning_rate": 0.003, + "loss": 4.11, + "step": 14516 + }, + { + "epoch": 0.14517, + "grad_norm": 0.9040870044826432, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 14517 + }, + { + "epoch": 0.14518, + "grad_norm": 0.9696100920898809, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 14518 + }, + { + "epoch": 0.14519, + "grad_norm": 1.0582262440718013, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 14519 + }, + { + "epoch": 0.1452, + "grad_norm": 0.9543756723548299, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 14520 + }, + { + "epoch": 0.14521, + "grad_norm": 0.899627836617662, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 14521 + }, + { + "epoch": 0.14522, + "grad_norm": 0.7550291121578049, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 14522 + }, + { + "epoch": 0.14523, + "grad_norm": 0.7813326825471762, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 14523 + }, + { + "epoch": 0.14524, + "grad_norm": 0.694317802060896, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 14524 + }, + { + "epoch": 0.14525, + "grad_norm": 0.6915796088984916, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 14525 + }, + { + "epoch": 0.14526, + "grad_norm": 0.7253829223209967, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 14526 + }, + { + "epoch": 0.14527, + "grad_norm": 0.8022210925623505, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 14527 + }, + { + "epoch": 0.14528, + "grad_norm": 0.9031507779943743, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 14528 + }, + { + "epoch": 0.14529, + "grad_norm": 0.9972110549426437, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 14529 + }, + { + "epoch": 0.1453, + "grad_norm": 1.0832360084774422, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 14530 + }, + { + "epoch": 0.14531, + "grad_norm": 1.105797035647914, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 14531 + }, + { + "epoch": 0.14532, + "grad_norm": 0.786615455076114, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 14532 + }, + { + "epoch": 0.14533, + "grad_norm": 0.7082786807676374, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 14533 + }, + { + "epoch": 0.14534, + "grad_norm": 0.8112960746828972, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 14534 + }, + { + "epoch": 0.14535, + "grad_norm": 0.9063381907497825, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 14535 + }, + { + "epoch": 0.14536, + "grad_norm": 0.8144238186101873, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 14536 + }, + { + "epoch": 0.14537, + "grad_norm": 0.7247677066233076, + "learning_rate": 0.003, + "loss": 4.05, + "step": 14537 + }, + { + "epoch": 0.14538, + "grad_norm": 0.703189835688407, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 14538 + }, + { + "epoch": 0.14539, + "grad_norm": 0.7847737719060214, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 14539 + }, + { + "epoch": 0.1454, + "grad_norm": 0.8689603553966835, + "learning_rate": 0.003, + "loss": 4.073, + "step": 14540 + }, + { + "epoch": 0.14541, + "grad_norm": 1.078177458358583, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 14541 + }, + { + "epoch": 0.14542, + "grad_norm": 1.0968398742626888, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 14542 + }, + { + "epoch": 0.14543, + "grad_norm": 1.057736346844444, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 14543 + }, + { + "epoch": 0.14544, + "grad_norm": 1.0096394014447299, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 14544 + }, + { + "epoch": 0.14545, + "grad_norm": 0.9595567075326513, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 14545 + }, + { + "epoch": 0.14546, + "grad_norm": 1.0237956273787805, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 14546 + }, + { + "epoch": 0.14547, + "grad_norm": 0.9296569813294112, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 14547 + }, + { + "epoch": 0.14548, + "grad_norm": 0.8580821455270444, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 14548 + }, + { + "epoch": 0.14549, + "grad_norm": 0.872113276229092, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 14549 + }, + { + "epoch": 0.1455, + "grad_norm": 0.977749134256614, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 14550 + }, + { + "epoch": 0.14551, + "grad_norm": 0.9399304829514836, + "learning_rate": 0.003, + "loss": 4.085, + "step": 14551 + }, + { + "epoch": 0.14552, + "grad_norm": 0.8663647607782625, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 14552 + }, + { + "epoch": 0.14553, + "grad_norm": 0.9136647582260118, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 14553 + }, + { + "epoch": 0.14554, + "grad_norm": 0.7399310086217266, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 14554 + }, + { + "epoch": 0.14555, + "grad_norm": 0.6743750311640493, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 14555 + }, + { + "epoch": 0.14556, + "grad_norm": 0.7830085187202371, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 14556 + }, + { + "epoch": 0.14557, + "grad_norm": 0.8716227519681807, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 14557 + }, + { + "epoch": 0.14558, + "grad_norm": 1.0059827438311395, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 14558 + }, + { + "epoch": 0.14559, + "grad_norm": 1.3009668224997837, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 14559 + }, + { + "epoch": 0.1456, + "grad_norm": 0.776695489158605, + "learning_rate": 0.003, + "loss": 4.056, + "step": 14560 + }, + { + "epoch": 0.14561, + "grad_norm": 0.6180285588487668, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 14561 + }, + { + "epoch": 0.14562, + "grad_norm": 0.7558658696472682, + "learning_rate": 0.003, + "loss": 4.064, + "step": 14562 + }, + { + "epoch": 0.14563, + "grad_norm": 0.8641124822104197, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 14563 + }, + { + "epoch": 0.14564, + "grad_norm": 0.9708162293916572, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 14564 + }, + { + "epoch": 0.14565, + "grad_norm": 0.9456146169123835, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 14565 + }, + { + "epoch": 0.14566, + "grad_norm": 0.8643872779025452, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 14566 + }, + { + "epoch": 0.14567, + "grad_norm": 0.8171882167001808, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 14567 + }, + { + "epoch": 0.14568, + "grad_norm": 0.8279861681618138, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 14568 + }, + { + "epoch": 0.14569, + "grad_norm": 0.6954121162267392, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 14569 + }, + { + "epoch": 0.1457, + "grad_norm": 0.6350707905293368, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 14570 + }, + { + "epoch": 0.14571, + "grad_norm": 0.6791557320846004, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 14571 + }, + { + "epoch": 0.14572, + "grad_norm": 0.8016430771828299, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 14572 + }, + { + "epoch": 0.14573, + "grad_norm": 1.0264268295157744, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 14573 + }, + { + "epoch": 0.14574, + "grad_norm": 1.1955488793948061, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 14574 + }, + { + "epoch": 0.14575, + "grad_norm": 0.7119546441286688, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 14575 + }, + { + "epoch": 0.14576, + "grad_norm": 0.714543676774026, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 14576 + }, + { + "epoch": 0.14577, + "grad_norm": 0.6603251991861709, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 14577 + }, + { + "epoch": 0.14578, + "grad_norm": 0.7986259057033251, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 14578 + }, + { + "epoch": 0.14579, + "grad_norm": 0.9782640365302716, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 14579 + }, + { + "epoch": 0.1458, + "grad_norm": 1.1327755080474855, + "learning_rate": 0.003, + "loss": 4.079, + "step": 14580 + }, + { + "epoch": 0.14581, + "grad_norm": 0.9360040545332118, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 14581 + }, + { + "epoch": 0.14582, + "grad_norm": 0.8794183719156582, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 14582 + }, + { + "epoch": 0.14583, + "grad_norm": 0.8116502083563901, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 14583 + }, + { + "epoch": 0.14584, + "grad_norm": 0.7997145982234645, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 14584 + }, + { + "epoch": 0.14585, + "grad_norm": 0.8783829350835252, + "learning_rate": 0.003, + "loss": 4.083, + "step": 14585 + }, + { + "epoch": 0.14586, + "grad_norm": 0.868254154872957, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 14586 + }, + { + "epoch": 0.14587, + "grad_norm": 0.8102510840724461, + "learning_rate": 0.003, + "loss": 4.074, + "step": 14587 + }, + { + "epoch": 0.14588, + "grad_norm": 0.76591626897998, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 14588 + }, + { + "epoch": 0.14589, + "grad_norm": 0.8506288982616398, + "learning_rate": 0.003, + "loss": 4.032, + "step": 14589 + }, + { + "epoch": 0.1459, + "grad_norm": 0.839280358810672, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 14590 + }, + { + "epoch": 0.14591, + "grad_norm": 0.8624119454127883, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 14591 + }, + { + "epoch": 0.14592, + "grad_norm": 0.8135901322358887, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 14592 + }, + { + "epoch": 0.14593, + "grad_norm": 0.8970900221202972, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 14593 + }, + { + "epoch": 0.14594, + "grad_norm": 1.073051140183195, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 14594 + }, + { + "epoch": 0.14595, + "grad_norm": 1.143921867376063, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 14595 + }, + { + "epoch": 0.14596, + "grad_norm": 0.9751246681564357, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 14596 + }, + { + "epoch": 0.14597, + "grad_norm": 0.9558439609153476, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 14597 + }, + { + "epoch": 0.14598, + "grad_norm": 0.8489551978752373, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 14598 + }, + { + "epoch": 0.14599, + "grad_norm": 0.7378453945629538, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 14599 + }, + { + "epoch": 0.146, + "grad_norm": 0.7620338281678484, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 14600 + }, + { + "epoch": 0.14601, + "grad_norm": 0.7872528072739438, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 14601 + }, + { + "epoch": 0.14602, + "grad_norm": 0.8283077690042281, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 14602 + }, + { + "epoch": 0.14603, + "grad_norm": 0.8875270533641183, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 14603 + }, + { + "epoch": 0.14604, + "grad_norm": 0.9602478703917192, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 14604 + }, + { + "epoch": 0.14605, + "grad_norm": 0.9356670106534075, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 14605 + }, + { + "epoch": 0.14606, + "grad_norm": 0.9591106551999335, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 14606 + }, + { + "epoch": 0.14607, + "grad_norm": 1.0597266076352092, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 14607 + }, + { + "epoch": 0.14608, + "grad_norm": 1.0148851676702153, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 14608 + }, + { + "epoch": 0.14609, + "grad_norm": 0.9344274923235639, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 14609 + }, + { + "epoch": 0.1461, + "grad_norm": 0.7351046879113694, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 14610 + }, + { + "epoch": 0.14611, + "grad_norm": 0.6541584505103268, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 14611 + }, + { + "epoch": 0.14612, + "grad_norm": 0.7163037137339126, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 14612 + }, + { + "epoch": 0.14613, + "grad_norm": 0.8797493890159305, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 14613 + }, + { + "epoch": 0.14614, + "grad_norm": 1.1123619922511716, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 14614 + }, + { + "epoch": 0.14615, + "grad_norm": 0.931610247635008, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 14615 + }, + { + "epoch": 0.14616, + "grad_norm": 0.8743518829454591, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 14616 + }, + { + "epoch": 0.14617, + "grad_norm": 0.9004877603438717, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 14617 + }, + { + "epoch": 0.14618, + "grad_norm": 0.968993029689825, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 14618 + }, + { + "epoch": 0.14619, + "grad_norm": 1.0270859579787186, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 14619 + }, + { + "epoch": 0.1462, + "grad_norm": 1.045345147298914, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 14620 + }, + { + "epoch": 0.14621, + "grad_norm": 0.8978367997055376, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 14621 + }, + { + "epoch": 0.14622, + "grad_norm": 0.8714211223153865, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 14622 + }, + { + "epoch": 0.14623, + "grad_norm": 0.853544821928365, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 14623 + }, + { + "epoch": 0.14624, + "grad_norm": 0.9256017866669742, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 14624 + }, + { + "epoch": 0.14625, + "grad_norm": 1.1011178026911914, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 14625 + }, + { + "epoch": 0.14626, + "grad_norm": 0.9988850699587214, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 14626 + }, + { + "epoch": 0.14627, + "grad_norm": 0.849954492700033, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 14627 + }, + { + "epoch": 0.14628, + "grad_norm": 0.9278147679571234, + "learning_rate": 0.003, + "loss": 4.095, + "step": 14628 + }, + { + "epoch": 0.14629, + "grad_norm": 1.1390529378052798, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 14629 + }, + { + "epoch": 0.1463, + "grad_norm": 0.94019973826379, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 14630 + }, + { + "epoch": 0.14631, + "grad_norm": 0.9850923102058784, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 14631 + }, + { + "epoch": 0.14632, + "grad_norm": 0.8180294607237206, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 14632 + }, + { + "epoch": 0.14633, + "grad_norm": 0.7238626279935578, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 14633 + }, + { + "epoch": 0.14634, + "grad_norm": 0.7124310769930774, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 14634 + }, + { + "epoch": 0.14635, + "grad_norm": 0.7720391825748518, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 14635 + }, + { + "epoch": 0.14636, + "grad_norm": 0.7720467057553294, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 14636 + }, + { + "epoch": 0.14637, + "grad_norm": 0.7765657830166095, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 14637 + }, + { + "epoch": 0.14638, + "grad_norm": 0.752024170890064, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 14638 + }, + { + "epoch": 0.14639, + "grad_norm": 0.7137448304298398, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 14639 + }, + { + "epoch": 0.1464, + "grad_norm": 0.9394829416126098, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 14640 + }, + { + "epoch": 0.14641, + "grad_norm": 1.2614752983356443, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 14641 + }, + { + "epoch": 0.14642, + "grad_norm": 0.8767862521104057, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 14642 + }, + { + "epoch": 0.14643, + "grad_norm": 0.7207122124663736, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 14643 + }, + { + "epoch": 0.14644, + "grad_norm": 0.6281009563715171, + "learning_rate": 0.003, + "loss": 4.029, + "step": 14644 + }, + { + "epoch": 0.14645, + "grad_norm": 0.5840767945666202, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 14645 + }, + { + "epoch": 0.14646, + "grad_norm": 0.5869076032495534, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 14646 + }, + { + "epoch": 0.14647, + "grad_norm": 0.5245143418956627, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 14647 + }, + { + "epoch": 0.14648, + "grad_norm": 0.5789115806082246, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 14648 + }, + { + "epoch": 0.14649, + "grad_norm": 0.6069099859606367, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 14649 + }, + { + "epoch": 0.1465, + "grad_norm": 0.59759498127923, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 14650 + }, + { + "epoch": 0.14651, + "grad_norm": 0.5684717975111642, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 14651 + }, + { + "epoch": 0.14652, + "grad_norm": 0.5627215808491731, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 14652 + }, + { + "epoch": 0.14653, + "grad_norm": 0.6655467461551765, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 14653 + }, + { + "epoch": 0.14654, + "grad_norm": 0.7313967000135548, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 14654 + }, + { + "epoch": 0.14655, + "grad_norm": 0.7716177143225461, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 14655 + }, + { + "epoch": 0.14656, + "grad_norm": 0.7932886713813072, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 14656 + }, + { + "epoch": 0.14657, + "grad_norm": 1.048857394901038, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 14657 + }, + { + "epoch": 0.14658, + "grad_norm": 1.3762766534991493, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 14658 + }, + { + "epoch": 0.14659, + "grad_norm": 0.7287529165877537, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 14659 + }, + { + "epoch": 0.1466, + "grad_norm": 0.7813755703515315, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 14660 + }, + { + "epoch": 0.14661, + "grad_norm": 0.8299047236815597, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 14661 + }, + { + "epoch": 0.14662, + "grad_norm": 0.9433792544809567, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 14662 + }, + { + "epoch": 0.14663, + "grad_norm": 1.102204855802617, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 14663 + }, + { + "epoch": 0.14664, + "grad_norm": 1.036955760055929, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 14664 + }, + { + "epoch": 0.14665, + "grad_norm": 1.095144185653256, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 14665 + }, + { + "epoch": 0.14666, + "grad_norm": 0.9939603749475371, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 14666 + }, + { + "epoch": 0.14667, + "grad_norm": 1.0756469695185484, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 14667 + }, + { + "epoch": 0.14668, + "grad_norm": 1.001666873680944, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 14668 + }, + { + "epoch": 0.14669, + "grad_norm": 0.9468326311851436, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 14669 + }, + { + "epoch": 0.1467, + "grad_norm": 0.8864556572703375, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 14670 + }, + { + "epoch": 0.14671, + "grad_norm": 0.9495657902929036, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 14671 + }, + { + "epoch": 0.14672, + "grad_norm": 0.9735126737119133, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 14672 + }, + { + "epoch": 0.14673, + "grad_norm": 0.9317013267782335, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 14673 + }, + { + "epoch": 0.14674, + "grad_norm": 0.8267662344630435, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 14674 + }, + { + "epoch": 0.14675, + "grad_norm": 0.8465527100738274, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 14675 + }, + { + "epoch": 0.14676, + "grad_norm": 0.7975477618697624, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 14676 + }, + { + "epoch": 0.14677, + "grad_norm": 0.746888833612203, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 14677 + }, + { + "epoch": 0.14678, + "grad_norm": 0.9099746904431035, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 14678 + }, + { + "epoch": 0.14679, + "grad_norm": 0.9924469453796482, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 14679 + }, + { + "epoch": 0.1468, + "grad_norm": 1.0255541725347632, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 14680 + }, + { + "epoch": 0.14681, + "grad_norm": 0.9293619106149132, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 14681 + }, + { + "epoch": 0.14682, + "grad_norm": 0.8593043718586173, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 14682 + }, + { + "epoch": 0.14683, + "grad_norm": 0.9023966416140686, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 14683 + }, + { + "epoch": 0.14684, + "grad_norm": 0.9075070152522987, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 14684 + }, + { + "epoch": 0.14685, + "grad_norm": 0.990139660391972, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 14685 + }, + { + "epoch": 0.14686, + "grad_norm": 1.153361119676969, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 14686 + }, + { + "epoch": 0.14687, + "grad_norm": 1.0618144659524875, + "learning_rate": 0.003, + "loss": 4.082, + "step": 14687 + }, + { + "epoch": 0.14688, + "grad_norm": 1.1146777987509127, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 14688 + }, + { + "epoch": 0.14689, + "grad_norm": 0.809168480469712, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 14689 + }, + { + "epoch": 0.1469, + "grad_norm": 0.8571241803552251, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 14690 + }, + { + "epoch": 0.14691, + "grad_norm": 0.9309826043601989, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 14691 + }, + { + "epoch": 0.14692, + "grad_norm": 0.839841465329743, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 14692 + }, + { + "epoch": 0.14693, + "grad_norm": 0.696634463907577, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 14693 + }, + { + "epoch": 0.14694, + "grad_norm": 0.6860810439075222, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 14694 + }, + { + "epoch": 0.14695, + "grad_norm": 0.7375096967257634, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 14695 + }, + { + "epoch": 0.14696, + "grad_norm": 0.833826589352164, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 14696 + }, + { + "epoch": 0.14697, + "grad_norm": 0.985952008144028, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 14697 + }, + { + "epoch": 0.14698, + "grad_norm": 1.101613432328946, + "learning_rate": 0.003, + "loss": 4.057, + "step": 14698 + }, + { + "epoch": 0.14699, + "grad_norm": 0.8630794099808715, + "learning_rate": 0.003, + "loss": 4.034, + "step": 14699 + }, + { + "epoch": 0.147, + "grad_norm": 0.828157332753674, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 14700 + }, + { + "epoch": 0.14701, + "grad_norm": 0.7381105295462711, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 14701 + }, + { + "epoch": 0.14702, + "grad_norm": 0.7181426099842978, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 14702 + }, + { + "epoch": 0.14703, + "grad_norm": 0.7714457014561105, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 14703 + }, + { + "epoch": 0.14704, + "grad_norm": 0.8031825957815455, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 14704 + }, + { + "epoch": 0.14705, + "grad_norm": 0.8271898204374427, + "learning_rate": 0.003, + "loss": 4.062, + "step": 14705 + }, + { + "epoch": 0.14706, + "grad_norm": 0.7859278980344379, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 14706 + }, + { + "epoch": 0.14707, + "grad_norm": 0.9239315061288464, + "learning_rate": 0.003, + "loss": 4.079, + "step": 14707 + }, + { + "epoch": 0.14708, + "grad_norm": 1.036256286106465, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 14708 + }, + { + "epoch": 0.14709, + "grad_norm": 0.9262385110211995, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 14709 + }, + { + "epoch": 0.1471, + "grad_norm": 0.934436100353201, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 14710 + }, + { + "epoch": 0.14711, + "grad_norm": 1.0279129756159637, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 14711 + }, + { + "epoch": 0.14712, + "grad_norm": 1.0577358558193888, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 14712 + }, + { + "epoch": 0.14713, + "grad_norm": 1.0515648948673155, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 14713 + }, + { + "epoch": 0.14714, + "grad_norm": 0.8512134099767551, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 14714 + }, + { + "epoch": 0.14715, + "grad_norm": 0.7196155645320721, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 14715 + }, + { + "epoch": 0.14716, + "grad_norm": 0.6925672881321339, + "learning_rate": 0.003, + "loss": 4.098, + "step": 14716 + }, + { + "epoch": 0.14717, + "grad_norm": 0.7593293024180404, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 14717 + }, + { + "epoch": 0.14718, + "grad_norm": 0.870702469651855, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 14718 + }, + { + "epoch": 0.14719, + "grad_norm": 0.8852622342973488, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 14719 + }, + { + "epoch": 0.1472, + "grad_norm": 0.7412346900448475, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 14720 + }, + { + "epoch": 0.14721, + "grad_norm": 0.7934841697821211, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 14721 + }, + { + "epoch": 0.14722, + "grad_norm": 0.8274894305948751, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 14722 + }, + { + "epoch": 0.14723, + "grad_norm": 0.8076222834921906, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 14723 + }, + { + "epoch": 0.14724, + "grad_norm": 0.7713772507344148, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 14724 + }, + { + "epoch": 0.14725, + "grad_norm": 0.7802409897582635, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 14725 + }, + { + "epoch": 0.14726, + "grad_norm": 0.9637147817610195, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 14726 + }, + { + "epoch": 0.14727, + "grad_norm": 1.3358401635138504, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 14727 + }, + { + "epoch": 0.14728, + "grad_norm": 1.094002866839905, + "learning_rate": 0.003, + "loss": 4.061, + "step": 14728 + }, + { + "epoch": 0.14729, + "grad_norm": 1.064471378414021, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 14729 + }, + { + "epoch": 0.1473, + "grad_norm": 0.9103974869801705, + "learning_rate": 0.003, + "loss": 4.065, + "step": 14730 + }, + { + "epoch": 0.14731, + "grad_norm": 0.8250873471989922, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 14731 + }, + { + "epoch": 0.14732, + "grad_norm": 0.7599700541878261, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 14732 + }, + { + "epoch": 0.14733, + "grad_norm": 0.6575434766956358, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 14733 + }, + { + "epoch": 0.14734, + "grad_norm": 0.6050204566231443, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 14734 + }, + { + "epoch": 0.14735, + "grad_norm": 0.6940692057028003, + "learning_rate": 0.003, + "loss": 4.082, + "step": 14735 + }, + { + "epoch": 0.14736, + "grad_norm": 0.84017104510247, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 14736 + }, + { + "epoch": 0.14737, + "grad_norm": 0.8754131892508064, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 14737 + }, + { + "epoch": 0.14738, + "grad_norm": 0.9615225558629688, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 14738 + }, + { + "epoch": 0.14739, + "grad_norm": 1.1126273279398256, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 14739 + }, + { + "epoch": 0.1474, + "grad_norm": 0.8789448851738478, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 14740 + }, + { + "epoch": 0.14741, + "grad_norm": 0.7549636461614806, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 14741 + }, + { + "epoch": 0.14742, + "grad_norm": 0.7651498938045254, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 14742 + }, + { + "epoch": 0.14743, + "grad_norm": 0.8701006594439856, + "learning_rate": 0.003, + "loss": 4.087, + "step": 14743 + }, + { + "epoch": 0.14744, + "grad_norm": 0.9835486468749542, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 14744 + }, + { + "epoch": 0.14745, + "grad_norm": 1.0642331575158999, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 14745 + }, + { + "epoch": 0.14746, + "grad_norm": 1.0492294401106332, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 14746 + }, + { + "epoch": 0.14747, + "grad_norm": 0.9413981212070113, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 14747 + }, + { + "epoch": 0.14748, + "grad_norm": 0.9397069742006033, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 14748 + }, + { + "epoch": 0.14749, + "grad_norm": 1.11373609411613, + "learning_rate": 0.003, + "loss": 4.08, + "step": 14749 + }, + { + "epoch": 0.1475, + "grad_norm": 0.8865431370636784, + "learning_rate": 0.003, + "loss": 4.084, + "step": 14750 + }, + { + "epoch": 0.14751, + "grad_norm": 0.5582480524303068, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 14751 + }, + { + "epoch": 0.14752, + "grad_norm": 0.6777600396655671, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 14752 + }, + { + "epoch": 0.14753, + "grad_norm": 0.7008825384856778, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 14753 + }, + { + "epoch": 0.14754, + "grad_norm": 0.6260364997682627, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 14754 + }, + { + "epoch": 0.14755, + "grad_norm": 0.6217675935186241, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 14755 + }, + { + "epoch": 0.14756, + "grad_norm": 0.7067199034422367, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 14756 + }, + { + "epoch": 0.14757, + "grad_norm": 0.8345884707638084, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 14757 + }, + { + "epoch": 0.14758, + "grad_norm": 0.9419638110211473, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 14758 + }, + { + "epoch": 0.14759, + "grad_norm": 1.002513849179099, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 14759 + }, + { + "epoch": 0.1476, + "grad_norm": 1.0061704996101026, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 14760 + }, + { + "epoch": 0.14761, + "grad_norm": 0.878344839961622, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 14761 + }, + { + "epoch": 0.14762, + "grad_norm": 0.786588567570064, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 14762 + }, + { + "epoch": 0.14763, + "grad_norm": 0.7889398622042119, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 14763 + }, + { + "epoch": 0.14764, + "grad_norm": 0.7687087384937162, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 14764 + }, + { + "epoch": 0.14765, + "grad_norm": 0.7510031088264584, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 14765 + }, + { + "epoch": 0.14766, + "grad_norm": 0.8324646367403518, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 14766 + }, + { + "epoch": 0.14767, + "grad_norm": 0.8349003272011257, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 14767 + }, + { + "epoch": 0.14768, + "grad_norm": 0.9259044255796463, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 14768 + }, + { + "epoch": 0.14769, + "grad_norm": 1.0284923847717176, + "learning_rate": 0.003, + "loss": 4.057, + "step": 14769 + }, + { + "epoch": 0.1477, + "grad_norm": 0.976762133188089, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 14770 + }, + { + "epoch": 0.14771, + "grad_norm": 1.0032274857983992, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 14771 + }, + { + "epoch": 0.14772, + "grad_norm": 1.0694798217390225, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 14772 + }, + { + "epoch": 0.14773, + "grad_norm": 0.9811266736398486, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 14773 + }, + { + "epoch": 0.14774, + "grad_norm": 0.9451081301158704, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 14774 + }, + { + "epoch": 0.14775, + "grad_norm": 0.9195920238594352, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 14775 + }, + { + "epoch": 0.14776, + "grad_norm": 1.045619909819611, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 14776 + }, + { + "epoch": 0.14777, + "grad_norm": 1.1195692749302386, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 14777 + }, + { + "epoch": 0.14778, + "grad_norm": 0.9256250140321228, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 14778 + }, + { + "epoch": 0.14779, + "grad_norm": 0.8785218642952807, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 14779 + }, + { + "epoch": 0.1478, + "grad_norm": 0.7851239946189108, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 14780 + }, + { + "epoch": 0.14781, + "grad_norm": 0.78004579180118, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 14781 + }, + { + "epoch": 0.14782, + "grad_norm": 0.8543830696121397, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 14782 + }, + { + "epoch": 0.14783, + "grad_norm": 0.8838195530462661, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 14783 + }, + { + "epoch": 0.14784, + "grad_norm": 1.0217768222772443, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 14784 + }, + { + "epoch": 0.14785, + "grad_norm": 1.0514376758786688, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 14785 + }, + { + "epoch": 0.14786, + "grad_norm": 0.9606875671622199, + "learning_rate": 0.003, + "loss": 4.075, + "step": 14786 + }, + { + "epoch": 0.14787, + "grad_norm": 0.9243655458910466, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 14787 + }, + { + "epoch": 0.14788, + "grad_norm": 0.7511045449334031, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 14788 + }, + { + "epoch": 0.14789, + "grad_norm": 0.7090628535047875, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 14789 + }, + { + "epoch": 0.1479, + "grad_norm": 0.7319883043158829, + "learning_rate": 0.003, + "loss": 4.064, + "step": 14790 + }, + { + "epoch": 0.14791, + "grad_norm": 0.7145543038042341, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 14791 + }, + { + "epoch": 0.14792, + "grad_norm": 0.8122521791205854, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 14792 + }, + { + "epoch": 0.14793, + "grad_norm": 0.8448586491217989, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 14793 + }, + { + "epoch": 0.14794, + "grad_norm": 0.9455116940514773, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 14794 + }, + { + "epoch": 0.14795, + "grad_norm": 0.9706749144712972, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 14795 + }, + { + "epoch": 0.14796, + "grad_norm": 0.974879979185396, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 14796 + }, + { + "epoch": 0.14797, + "grad_norm": 1.1141913255546896, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 14797 + }, + { + "epoch": 0.14798, + "grad_norm": 1.0529221422179145, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 14798 + }, + { + "epoch": 0.14799, + "grad_norm": 0.9894264927034992, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 14799 + }, + { + "epoch": 0.148, + "grad_norm": 1.0194162328251322, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 14800 + }, + { + "epoch": 0.14801, + "grad_norm": 0.9302982935422128, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 14801 + }, + { + "epoch": 0.14802, + "grad_norm": 1.0780535412079117, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 14802 + }, + { + "epoch": 0.14803, + "grad_norm": 0.9887225458379839, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 14803 + }, + { + "epoch": 0.14804, + "grad_norm": 0.9716411164392459, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 14804 + }, + { + "epoch": 0.14805, + "grad_norm": 0.8637497964270253, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 14805 + }, + { + "epoch": 0.14806, + "grad_norm": 0.7903898273337917, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 14806 + }, + { + "epoch": 0.14807, + "grad_norm": 0.8490409046573257, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 14807 + }, + { + "epoch": 0.14808, + "grad_norm": 1.0034610976823193, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 14808 + }, + { + "epoch": 0.14809, + "grad_norm": 1.1816087044358994, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 14809 + }, + { + "epoch": 0.1481, + "grad_norm": 0.9064930970852736, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 14810 + }, + { + "epoch": 0.14811, + "grad_norm": 0.8516828867538914, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 14811 + }, + { + "epoch": 0.14812, + "grad_norm": 0.8061801702923695, + "learning_rate": 0.003, + "loss": 4.027, + "step": 14812 + }, + { + "epoch": 0.14813, + "grad_norm": 0.8251069162496621, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 14813 + }, + { + "epoch": 0.14814, + "grad_norm": 0.8553441504363729, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 14814 + }, + { + "epoch": 0.14815, + "grad_norm": 1.0016757800662375, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 14815 + }, + { + "epoch": 0.14816, + "grad_norm": 1.0346234884618168, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 14816 + }, + { + "epoch": 0.14817, + "grad_norm": 0.7749126658491023, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 14817 + }, + { + "epoch": 0.14818, + "grad_norm": 0.7200310165662526, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 14818 + }, + { + "epoch": 0.14819, + "grad_norm": 0.8394696666783071, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 14819 + }, + { + "epoch": 0.1482, + "grad_norm": 0.9057421912259922, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 14820 + }, + { + "epoch": 0.14821, + "grad_norm": 0.8886738161507868, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 14821 + }, + { + "epoch": 0.14822, + "grad_norm": 1.045143250935779, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 14822 + }, + { + "epoch": 0.14823, + "grad_norm": 0.8295574752948383, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 14823 + }, + { + "epoch": 0.14824, + "grad_norm": 0.7083040585887035, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 14824 + }, + { + "epoch": 0.14825, + "grad_norm": 0.7850557599282676, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 14825 + }, + { + "epoch": 0.14826, + "grad_norm": 0.7667351367600366, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 14826 + }, + { + "epoch": 0.14827, + "grad_norm": 0.8439361869003991, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 14827 + }, + { + "epoch": 0.14828, + "grad_norm": 0.9527190140415227, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 14828 + }, + { + "epoch": 0.14829, + "grad_norm": 0.9447239642238727, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 14829 + }, + { + "epoch": 0.1483, + "grad_norm": 0.8747502536455956, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 14830 + }, + { + "epoch": 0.14831, + "grad_norm": 0.8136418171167563, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 14831 + }, + { + "epoch": 0.14832, + "grad_norm": 0.7488638523264456, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 14832 + }, + { + "epoch": 0.14833, + "grad_norm": 0.7674862775383623, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 14833 + }, + { + "epoch": 0.14834, + "grad_norm": 0.8227403242309463, + "learning_rate": 0.003, + "loss": 4.075, + "step": 14834 + }, + { + "epoch": 0.14835, + "grad_norm": 0.8244965620341915, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 14835 + }, + { + "epoch": 0.14836, + "grad_norm": 0.7292023786298032, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 14836 + }, + { + "epoch": 0.14837, + "grad_norm": 0.634328887281756, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 14837 + }, + { + "epoch": 0.14838, + "grad_norm": 0.6310999363266393, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 14838 + }, + { + "epoch": 0.14839, + "grad_norm": 0.7649104871170423, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 14839 + }, + { + "epoch": 0.1484, + "grad_norm": 0.9326303923828554, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 14840 + }, + { + "epoch": 0.14841, + "grad_norm": 1.0164941161077334, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 14841 + }, + { + "epoch": 0.14842, + "grad_norm": 0.9786120246103639, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 14842 + }, + { + "epoch": 0.14843, + "grad_norm": 1.0209791174342724, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 14843 + }, + { + "epoch": 0.14844, + "grad_norm": 0.9639623705046754, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 14844 + }, + { + "epoch": 0.14845, + "grad_norm": 0.9282902973995008, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 14845 + }, + { + "epoch": 0.14846, + "grad_norm": 0.9217950902227127, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 14846 + }, + { + "epoch": 0.14847, + "grad_norm": 0.8824808881635553, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 14847 + }, + { + "epoch": 0.14848, + "grad_norm": 0.8450670052059192, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 14848 + }, + { + "epoch": 0.14849, + "grad_norm": 0.8121697588609396, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 14849 + }, + { + "epoch": 0.1485, + "grad_norm": 0.8250310663417711, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 14850 + }, + { + "epoch": 0.14851, + "grad_norm": 1.050910913970291, + "learning_rate": 0.003, + "loss": 4.071, + "step": 14851 + }, + { + "epoch": 0.14852, + "grad_norm": 0.9577614016696363, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 14852 + }, + { + "epoch": 0.14853, + "grad_norm": 0.7960357516960352, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 14853 + }, + { + "epoch": 0.14854, + "grad_norm": 0.8254845284606344, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 14854 + }, + { + "epoch": 0.14855, + "grad_norm": 0.8080456450538953, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 14855 + }, + { + "epoch": 0.14856, + "grad_norm": 0.7753398929805875, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 14856 + }, + { + "epoch": 0.14857, + "grad_norm": 0.7061527106788822, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 14857 + }, + { + "epoch": 0.14858, + "grad_norm": 0.5800571865656208, + "learning_rate": 0.003, + "loss": 4.075, + "step": 14858 + }, + { + "epoch": 0.14859, + "grad_norm": 0.5832967010914923, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 14859 + }, + { + "epoch": 0.1486, + "grad_norm": 0.6681652966369956, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 14860 + }, + { + "epoch": 0.14861, + "grad_norm": 0.9662894337646466, + "learning_rate": 0.003, + "loss": 4.072, + "step": 14861 + }, + { + "epoch": 0.14862, + "grad_norm": 1.4940736064093287, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 14862 + }, + { + "epoch": 0.14863, + "grad_norm": 0.48684413451456354, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 14863 + }, + { + "epoch": 0.14864, + "grad_norm": 0.9214178308312404, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 14864 + }, + { + "epoch": 0.14865, + "grad_norm": 1.1051662933711708, + "learning_rate": 0.003, + "loss": 4.1053, + "step": 14865 + }, + { + "epoch": 0.14866, + "grad_norm": 0.6444855888481819, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 14866 + }, + { + "epoch": 0.14867, + "grad_norm": 0.7012195739490744, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 14867 + }, + { + "epoch": 0.14868, + "grad_norm": 0.6651698771785198, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 14868 + }, + { + "epoch": 0.14869, + "grad_norm": 0.7190387626118099, + "learning_rate": 0.003, + "loss": 4.069, + "step": 14869 + }, + { + "epoch": 0.1487, + "grad_norm": 0.7160446661129076, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 14870 + }, + { + "epoch": 0.14871, + "grad_norm": 0.7145206533123866, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 14871 + }, + { + "epoch": 0.14872, + "grad_norm": 0.8690487507205731, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 14872 + }, + { + "epoch": 0.14873, + "grad_norm": 1.0177862633302825, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 14873 + }, + { + "epoch": 0.14874, + "grad_norm": 1.04667369319081, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 14874 + }, + { + "epoch": 0.14875, + "grad_norm": 1.0012624441665312, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 14875 + }, + { + "epoch": 0.14876, + "grad_norm": 1.0680376580115962, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 14876 + }, + { + "epoch": 0.14877, + "grad_norm": 0.9653137634165351, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 14877 + }, + { + "epoch": 0.14878, + "grad_norm": 0.9771121139268824, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 14878 + }, + { + "epoch": 0.14879, + "grad_norm": 1.036955708787152, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 14879 + }, + { + "epoch": 0.1488, + "grad_norm": 0.9779097238863572, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 14880 + }, + { + "epoch": 0.14881, + "grad_norm": 0.9135611922651338, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 14881 + }, + { + "epoch": 0.14882, + "grad_norm": 1.0105999223406752, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 14882 + }, + { + "epoch": 0.14883, + "grad_norm": 1.0593121256665727, + "learning_rate": 0.003, + "loss": 4.081, + "step": 14883 + }, + { + "epoch": 0.14884, + "grad_norm": 0.9045938409237391, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 14884 + }, + { + "epoch": 0.14885, + "grad_norm": 0.8188998322728103, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 14885 + }, + { + "epoch": 0.14886, + "grad_norm": 0.7203265686049506, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 14886 + }, + { + "epoch": 0.14887, + "grad_norm": 0.7621853329044047, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 14887 + }, + { + "epoch": 0.14888, + "grad_norm": 0.7428639111160718, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 14888 + }, + { + "epoch": 0.14889, + "grad_norm": 0.688946974269044, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 14889 + }, + { + "epoch": 0.1489, + "grad_norm": 0.7727991382090984, + "learning_rate": 0.003, + "loss": 4.061, + "step": 14890 + }, + { + "epoch": 0.14891, + "grad_norm": 0.923764079240288, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 14891 + }, + { + "epoch": 0.14892, + "grad_norm": 1.0818850695456843, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 14892 + }, + { + "epoch": 0.14893, + "grad_norm": 0.9533515493596308, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 14893 + }, + { + "epoch": 0.14894, + "grad_norm": 0.981481341965367, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 14894 + }, + { + "epoch": 0.14895, + "grad_norm": 0.9285268392525123, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 14895 + }, + { + "epoch": 0.14896, + "grad_norm": 0.9065692112969851, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 14896 + }, + { + "epoch": 0.14897, + "grad_norm": 0.9286869561404768, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 14897 + }, + { + "epoch": 0.14898, + "grad_norm": 0.9693103568044311, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 14898 + }, + { + "epoch": 0.14899, + "grad_norm": 1.2318849337713658, + "learning_rate": 0.003, + "loss": 4.074, + "step": 14899 + }, + { + "epoch": 0.149, + "grad_norm": 0.7794046979508626, + "learning_rate": 0.003, + "loss": 4.07, + "step": 14900 + }, + { + "epoch": 0.14901, + "grad_norm": 0.8651071691592058, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 14901 + }, + { + "epoch": 0.14902, + "grad_norm": 0.9940404816213152, + "learning_rate": 0.003, + "loss": 4.094, + "step": 14902 + }, + { + "epoch": 0.14903, + "grad_norm": 0.9841659656864069, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 14903 + }, + { + "epoch": 0.14904, + "grad_norm": 0.9594197113729875, + "learning_rate": 0.003, + "loss": 4.106, + "step": 14904 + }, + { + "epoch": 0.14905, + "grad_norm": 1.0375522482259765, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 14905 + }, + { + "epoch": 0.14906, + "grad_norm": 0.9748921579520625, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 14906 + }, + { + "epoch": 0.14907, + "grad_norm": 0.9352250012513379, + "learning_rate": 0.003, + "loss": 4.085, + "step": 14907 + }, + { + "epoch": 0.14908, + "grad_norm": 0.9259155938051827, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 14908 + }, + { + "epoch": 0.14909, + "grad_norm": 0.8927050335806, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 14909 + }, + { + "epoch": 0.1491, + "grad_norm": 0.7725386609308739, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 14910 + }, + { + "epoch": 0.14911, + "grad_norm": 0.6701881124744672, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 14911 + }, + { + "epoch": 0.14912, + "grad_norm": 0.7280153131448277, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 14912 + }, + { + "epoch": 0.14913, + "grad_norm": 0.8517887732778402, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 14913 + }, + { + "epoch": 0.14914, + "grad_norm": 0.9656980157846073, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 14914 + }, + { + "epoch": 0.14915, + "grad_norm": 1.0086694536682543, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 14915 + }, + { + "epoch": 0.14916, + "grad_norm": 0.852154984227591, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 14916 + }, + { + "epoch": 0.14917, + "grad_norm": 0.7569734589829502, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 14917 + }, + { + "epoch": 0.14918, + "grad_norm": 0.6979820605494047, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 14918 + }, + { + "epoch": 0.14919, + "grad_norm": 0.6633397643554763, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 14919 + }, + { + "epoch": 0.1492, + "grad_norm": 0.619787494363792, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 14920 + }, + { + "epoch": 0.14921, + "grad_norm": 0.608910457091843, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 14921 + }, + { + "epoch": 0.14922, + "grad_norm": 0.6255806418384025, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 14922 + }, + { + "epoch": 0.14923, + "grad_norm": 0.7034802121224089, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 14923 + }, + { + "epoch": 0.14924, + "grad_norm": 0.7120507732000536, + "learning_rate": 0.003, + "loss": 4.041, + "step": 14924 + }, + { + "epoch": 0.14925, + "grad_norm": 0.731759045849398, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 14925 + }, + { + "epoch": 0.14926, + "grad_norm": 0.7958735905441341, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 14926 + }, + { + "epoch": 0.14927, + "grad_norm": 0.974310767132195, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 14927 + }, + { + "epoch": 0.14928, + "grad_norm": 1.1300574818742186, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 14928 + }, + { + "epoch": 0.14929, + "grad_norm": 0.6874079550294511, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 14929 + }, + { + "epoch": 0.1493, + "grad_norm": 0.5768107042264596, + "learning_rate": 0.003, + "loss": 4.052, + "step": 14930 + }, + { + "epoch": 0.14931, + "grad_norm": 0.6451752914617287, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 14931 + }, + { + "epoch": 0.14932, + "grad_norm": 0.666473932481538, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 14932 + }, + { + "epoch": 0.14933, + "grad_norm": 0.7766091975960613, + "learning_rate": 0.003, + "loss": 4.05, + "step": 14933 + }, + { + "epoch": 0.14934, + "grad_norm": 0.8452392387063843, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 14934 + }, + { + "epoch": 0.14935, + "grad_norm": 0.9864910485720647, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 14935 + }, + { + "epoch": 0.14936, + "grad_norm": 1.0719299273206342, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 14936 + }, + { + "epoch": 0.14937, + "grad_norm": 0.9071005461209528, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 14937 + }, + { + "epoch": 0.14938, + "grad_norm": 0.8405463476567478, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 14938 + }, + { + "epoch": 0.14939, + "grad_norm": 0.8575181261962258, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 14939 + }, + { + "epoch": 0.1494, + "grad_norm": 0.8796580025409219, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 14940 + }, + { + "epoch": 0.14941, + "grad_norm": 0.8635099200038631, + "learning_rate": 0.003, + "loss": 4.105, + "step": 14941 + }, + { + "epoch": 0.14942, + "grad_norm": 0.765315575062824, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 14942 + }, + { + "epoch": 0.14943, + "grad_norm": 0.7096407636591316, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 14943 + }, + { + "epoch": 0.14944, + "grad_norm": 0.7977257960695163, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 14944 + }, + { + "epoch": 0.14945, + "grad_norm": 0.782795689153648, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 14945 + }, + { + "epoch": 0.14946, + "grad_norm": 0.7725557924540706, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 14946 + }, + { + "epoch": 0.14947, + "grad_norm": 0.8471655105920625, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 14947 + }, + { + "epoch": 0.14948, + "grad_norm": 1.031115687120366, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 14948 + }, + { + "epoch": 0.14949, + "grad_norm": 1.1187946877050938, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 14949 + }, + { + "epoch": 0.1495, + "grad_norm": 0.8942213332346919, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 14950 + }, + { + "epoch": 0.14951, + "grad_norm": 0.9204781667982811, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 14951 + }, + { + "epoch": 0.14952, + "grad_norm": 0.9352311395830009, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 14952 + }, + { + "epoch": 0.14953, + "grad_norm": 1.0093833778732473, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 14953 + }, + { + "epoch": 0.14954, + "grad_norm": 1.0478771341499364, + "learning_rate": 0.003, + "loss": 4.112, + "step": 14954 + }, + { + "epoch": 0.14955, + "grad_norm": 0.882340612382404, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 14955 + }, + { + "epoch": 0.14956, + "grad_norm": 0.8578629575346205, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 14956 + }, + { + "epoch": 0.14957, + "grad_norm": 0.9048449868787202, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 14957 + }, + { + "epoch": 0.14958, + "grad_norm": 0.9866701700073888, + "learning_rate": 0.003, + "loss": 4.103, + "step": 14958 + }, + { + "epoch": 0.14959, + "grad_norm": 1.0395924027785424, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 14959 + }, + { + "epoch": 0.1496, + "grad_norm": 1.0718346439433137, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 14960 + }, + { + "epoch": 0.14961, + "grad_norm": 1.017150832806336, + "learning_rate": 0.003, + "loss": 4.075, + "step": 14961 + }, + { + "epoch": 0.14962, + "grad_norm": 1.0949210508719271, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 14962 + }, + { + "epoch": 0.14963, + "grad_norm": 0.8677949581703225, + "learning_rate": 0.003, + "loss": 4.076, + "step": 14963 + }, + { + "epoch": 0.14964, + "grad_norm": 0.8208694744407704, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 14964 + }, + { + "epoch": 0.14965, + "grad_norm": 0.7672724181788401, + "learning_rate": 0.003, + "loss": 4.067, + "step": 14965 + }, + { + "epoch": 0.14966, + "grad_norm": 0.7334087592400507, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 14966 + }, + { + "epoch": 0.14967, + "grad_norm": 0.8592385250407862, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 14967 + }, + { + "epoch": 0.14968, + "grad_norm": 0.9927538887648965, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 14968 + }, + { + "epoch": 0.14969, + "grad_norm": 0.9994553782115532, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 14969 + }, + { + "epoch": 0.1497, + "grad_norm": 1.047072417223147, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 14970 + }, + { + "epoch": 0.14971, + "grad_norm": 1.076232570183918, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 14971 + }, + { + "epoch": 0.14972, + "grad_norm": 0.9348679591768919, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 14972 + }, + { + "epoch": 0.14973, + "grad_norm": 0.8771961907491908, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 14973 + }, + { + "epoch": 0.14974, + "grad_norm": 0.9099521244080998, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 14974 + }, + { + "epoch": 0.14975, + "grad_norm": 0.9894242589418759, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 14975 + }, + { + "epoch": 0.14976, + "grad_norm": 1.1150719540092562, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 14976 + }, + { + "epoch": 0.14977, + "grad_norm": 0.9052581333802919, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 14977 + }, + { + "epoch": 0.14978, + "grad_norm": 0.8720960042061171, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 14978 + }, + { + "epoch": 0.14979, + "grad_norm": 0.9141304591155341, + "learning_rate": 0.003, + "loss": 4.075, + "step": 14979 + }, + { + "epoch": 0.1498, + "grad_norm": 0.8629320719816637, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 14980 + }, + { + "epoch": 0.14981, + "grad_norm": 0.8454396606444087, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 14981 + }, + { + "epoch": 0.14982, + "grad_norm": 0.8829463599536927, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 14982 + }, + { + "epoch": 0.14983, + "grad_norm": 0.9891816977960454, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 14983 + }, + { + "epoch": 0.14984, + "grad_norm": 0.9984058079787899, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 14984 + }, + { + "epoch": 0.14985, + "grad_norm": 0.7629448942608905, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 14985 + }, + { + "epoch": 0.14986, + "grad_norm": 0.6394039871849966, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 14986 + }, + { + "epoch": 0.14987, + "grad_norm": 0.6582736174880554, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 14987 + }, + { + "epoch": 0.14988, + "grad_norm": 0.6674299029980075, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 14988 + }, + { + "epoch": 0.14989, + "grad_norm": 0.6980416705713386, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 14989 + }, + { + "epoch": 0.1499, + "grad_norm": 0.6913745885470995, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 14990 + }, + { + "epoch": 0.14991, + "grad_norm": 0.6007846402580602, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 14991 + }, + { + "epoch": 0.14992, + "grad_norm": 0.5408813537938235, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 14992 + }, + { + "epoch": 0.14993, + "grad_norm": 0.5267819341584272, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 14993 + }, + { + "epoch": 0.14994, + "grad_norm": 0.5165303340641763, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 14994 + }, + { + "epoch": 0.14995, + "grad_norm": 0.5942808753876797, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 14995 + }, + { + "epoch": 0.14996, + "grad_norm": 0.7343045389402316, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 14996 + }, + { + "epoch": 0.14997, + "grad_norm": 0.7920155719691849, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 14997 + }, + { + "epoch": 0.14998, + "grad_norm": 0.9447915131845857, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 14998 + }, + { + "epoch": 0.14999, + "grad_norm": 1.3156869354631486, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 14999 + }, + { + "epoch": 0.15, + "grad_norm": 0.7348273108340998, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 15000 + }, + { + "epoch": 0.15001, + "grad_norm": 0.6559891082187708, + "learning_rate": 0.003, + "loss": 4.055, + "step": 15001 + }, + { + "epoch": 0.15002, + "grad_norm": 0.707503634726269, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 15002 + }, + { + "epoch": 0.15003, + "grad_norm": 0.726740953568803, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 15003 + }, + { + "epoch": 0.15004, + "grad_norm": 0.926987121955849, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 15004 + }, + { + "epoch": 0.15005, + "grad_norm": 1.169706646737343, + "learning_rate": 0.003, + "loss": 4.096, + "step": 15005 + }, + { + "epoch": 0.15006, + "grad_norm": 0.858629392878249, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 15006 + }, + { + "epoch": 0.15007, + "grad_norm": 0.8710255151395264, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 15007 + }, + { + "epoch": 0.15008, + "grad_norm": 0.9323288184785412, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 15008 + }, + { + "epoch": 0.15009, + "grad_norm": 0.8016143464855188, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 15009 + }, + { + "epoch": 0.1501, + "grad_norm": 0.880337039559984, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 15010 + }, + { + "epoch": 0.15011, + "grad_norm": 1.0895991072764721, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 15011 + }, + { + "epoch": 0.15012, + "grad_norm": 1.1519094860806856, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 15012 + }, + { + "epoch": 0.15013, + "grad_norm": 0.8339136951685342, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 15013 + }, + { + "epoch": 0.15014, + "grad_norm": 0.7312198192986489, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 15014 + }, + { + "epoch": 0.15015, + "grad_norm": 0.7179300708126604, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 15015 + }, + { + "epoch": 0.15016, + "grad_norm": 0.8808797372870163, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 15016 + }, + { + "epoch": 0.15017, + "grad_norm": 1.1218937776661737, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 15017 + }, + { + "epoch": 0.15018, + "grad_norm": 0.8977155710382226, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 15018 + }, + { + "epoch": 0.15019, + "grad_norm": 0.8109154105901505, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 15019 + }, + { + "epoch": 0.1502, + "grad_norm": 0.8098722962407195, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 15020 + }, + { + "epoch": 0.15021, + "grad_norm": 0.8669939347458737, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 15021 + }, + { + "epoch": 0.15022, + "grad_norm": 1.0375664805207678, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 15022 + }, + { + "epoch": 0.15023, + "grad_norm": 1.0426634912626966, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 15023 + }, + { + "epoch": 0.15024, + "grad_norm": 0.8736846127438801, + "learning_rate": 0.003, + "loss": 4.049, + "step": 15024 + }, + { + "epoch": 0.15025, + "grad_norm": 0.9437814264314455, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 15025 + }, + { + "epoch": 0.15026, + "grad_norm": 1.1321860388220588, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 15026 + }, + { + "epoch": 0.15027, + "grad_norm": 1.0450166618111094, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 15027 + }, + { + "epoch": 0.15028, + "grad_norm": 0.874551180787626, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 15028 + }, + { + "epoch": 0.15029, + "grad_norm": 0.769974405546577, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 15029 + }, + { + "epoch": 0.1503, + "grad_norm": 0.7224279390896323, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 15030 + }, + { + "epoch": 0.15031, + "grad_norm": 0.6880650816822179, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 15031 + }, + { + "epoch": 0.15032, + "grad_norm": 0.6260300798186833, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 15032 + }, + { + "epoch": 0.15033, + "grad_norm": 0.6076110821161129, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 15033 + }, + { + "epoch": 0.15034, + "grad_norm": 0.727019230585347, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 15034 + }, + { + "epoch": 0.15035, + "grad_norm": 0.8451350538115437, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 15035 + }, + { + "epoch": 0.15036, + "grad_norm": 0.9391333908635195, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 15036 + }, + { + "epoch": 0.15037, + "grad_norm": 0.9568895429578911, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 15037 + }, + { + "epoch": 0.15038, + "grad_norm": 1.131316642422788, + "learning_rate": 0.003, + "loss": 4.092, + "step": 15038 + }, + { + "epoch": 0.15039, + "grad_norm": 1.0820982816406701, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 15039 + }, + { + "epoch": 0.1504, + "grad_norm": 1.0106112529513311, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 15040 + }, + { + "epoch": 0.15041, + "grad_norm": 0.9521076511768249, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 15041 + }, + { + "epoch": 0.15042, + "grad_norm": 0.920015805394499, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 15042 + }, + { + "epoch": 0.15043, + "grad_norm": 0.7552979883785288, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 15043 + }, + { + "epoch": 0.15044, + "grad_norm": 0.8342374556027915, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 15044 + }, + { + "epoch": 0.15045, + "grad_norm": 0.8943774965335569, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 15045 + }, + { + "epoch": 0.15046, + "grad_norm": 1.0531437113688678, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 15046 + }, + { + "epoch": 0.15047, + "grad_norm": 1.0180501141105878, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 15047 + }, + { + "epoch": 0.15048, + "grad_norm": 0.9121791262945473, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 15048 + }, + { + "epoch": 0.15049, + "grad_norm": 0.945566995431633, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 15049 + }, + { + "epoch": 0.1505, + "grad_norm": 0.8699715498075093, + "learning_rate": 0.003, + "loss": 4.081, + "step": 15050 + }, + { + "epoch": 0.15051, + "grad_norm": 0.8516562362658695, + "learning_rate": 0.003, + "loss": 4.042, + "step": 15051 + }, + { + "epoch": 0.15052, + "grad_norm": 0.847357439658091, + "learning_rate": 0.003, + "loss": 4.091, + "step": 15052 + }, + { + "epoch": 0.15053, + "grad_norm": 0.8539737558488057, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 15053 + }, + { + "epoch": 0.15054, + "grad_norm": 1.000812720741745, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 15054 + }, + { + "epoch": 0.15055, + "grad_norm": 1.1126528375972191, + "learning_rate": 0.003, + "loss": 4.085, + "step": 15055 + }, + { + "epoch": 0.15056, + "grad_norm": 0.7977120214576124, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 15056 + }, + { + "epoch": 0.15057, + "grad_norm": 0.7419229424382847, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 15057 + }, + { + "epoch": 0.15058, + "grad_norm": 0.7838019486444135, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 15058 + }, + { + "epoch": 0.15059, + "grad_norm": 0.7332083959605218, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 15059 + }, + { + "epoch": 0.1506, + "grad_norm": 0.7455840825888586, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 15060 + }, + { + "epoch": 0.15061, + "grad_norm": 0.7721669725707283, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 15061 + }, + { + "epoch": 0.15062, + "grad_norm": 0.9315782636727437, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 15062 + }, + { + "epoch": 0.15063, + "grad_norm": 1.0829324772663194, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 15063 + }, + { + "epoch": 0.15064, + "grad_norm": 1.0719702137178482, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 15064 + }, + { + "epoch": 0.15065, + "grad_norm": 0.9590853495894157, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 15065 + }, + { + "epoch": 0.15066, + "grad_norm": 0.9782700242694625, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 15066 + }, + { + "epoch": 0.15067, + "grad_norm": 0.9117010656159933, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 15067 + }, + { + "epoch": 0.15068, + "grad_norm": 0.8235517729639226, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 15068 + }, + { + "epoch": 0.15069, + "grad_norm": 0.8441872681964596, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 15069 + }, + { + "epoch": 0.1507, + "grad_norm": 0.7525765817838392, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 15070 + }, + { + "epoch": 0.15071, + "grad_norm": 0.8453931881506186, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 15071 + }, + { + "epoch": 0.15072, + "grad_norm": 0.8636282769401511, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 15072 + }, + { + "epoch": 0.15073, + "grad_norm": 0.9829775407411315, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 15073 + }, + { + "epoch": 0.15074, + "grad_norm": 1.0059675353774253, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 15074 + }, + { + "epoch": 0.15075, + "grad_norm": 0.8957322165552001, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 15075 + }, + { + "epoch": 0.15076, + "grad_norm": 0.853476197375503, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 15076 + }, + { + "epoch": 0.15077, + "grad_norm": 0.7950408125840406, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 15077 + }, + { + "epoch": 0.15078, + "grad_norm": 0.8142917115529535, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 15078 + }, + { + "epoch": 0.15079, + "grad_norm": 0.8477090125509233, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 15079 + }, + { + "epoch": 0.1508, + "grad_norm": 0.7663104448708389, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 15080 + }, + { + "epoch": 0.15081, + "grad_norm": 0.7397684908572042, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 15081 + }, + { + "epoch": 0.15082, + "grad_norm": 0.8354558800377258, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 15082 + }, + { + "epoch": 0.15083, + "grad_norm": 0.6910626555141486, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 15083 + }, + { + "epoch": 0.15084, + "grad_norm": 0.7293989807468484, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 15084 + }, + { + "epoch": 0.15085, + "grad_norm": 0.7642813294138431, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 15085 + }, + { + "epoch": 0.15086, + "grad_norm": 0.8882841442194418, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 15086 + }, + { + "epoch": 0.15087, + "grad_norm": 1.0427765998507974, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 15087 + }, + { + "epoch": 0.15088, + "grad_norm": 1.1424136012532162, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 15088 + }, + { + "epoch": 0.15089, + "grad_norm": 0.9658202607279203, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 15089 + }, + { + "epoch": 0.1509, + "grad_norm": 1.0009322490904504, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 15090 + }, + { + "epoch": 0.15091, + "grad_norm": 1.0655670822265935, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 15091 + }, + { + "epoch": 0.15092, + "grad_norm": 1.0325834214140834, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 15092 + }, + { + "epoch": 0.15093, + "grad_norm": 1.0279871288626194, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 15093 + }, + { + "epoch": 0.15094, + "grad_norm": 0.976515820218539, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 15094 + }, + { + "epoch": 0.15095, + "grad_norm": 1.0273089457940514, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 15095 + }, + { + "epoch": 0.15096, + "grad_norm": 1.0210313870405392, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 15096 + }, + { + "epoch": 0.15097, + "grad_norm": 1.0763912056309544, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 15097 + }, + { + "epoch": 0.15098, + "grad_norm": 1.0183582632170034, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 15098 + }, + { + "epoch": 0.15099, + "grad_norm": 1.0133441682251516, + "learning_rate": 0.003, + "loss": 4.074, + "step": 15099 + }, + { + "epoch": 0.151, + "grad_norm": 1.014058941118107, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 15100 + }, + { + "epoch": 0.15101, + "grad_norm": 0.9433122305218627, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 15101 + }, + { + "epoch": 0.15102, + "grad_norm": 0.905602098977912, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 15102 + }, + { + "epoch": 0.15103, + "grad_norm": 0.7943329362958932, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 15103 + }, + { + "epoch": 0.15104, + "grad_norm": 0.9579959372806925, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 15104 + }, + { + "epoch": 0.15105, + "grad_norm": 1.0770051648263383, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 15105 + }, + { + "epoch": 0.15106, + "grad_norm": 0.9924025895609426, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 15106 + }, + { + "epoch": 0.15107, + "grad_norm": 1.021375144229543, + "learning_rate": 0.003, + "loss": 4.07, + "step": 15107 + }, + { + "epoch": 0.15108, + "grad_norm": 0.9387463115084094, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 15108 + }, + { + "epoch": 0.15109, + "grad_norm": 0.9006975014037859, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 15109 + }, + { + "epoch": 0.1511, + "grad_norm": 0.8538120284930222, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 15110 + }, + { + "epoch": 0.15111, + "grad_norm": 0.8692401582453622, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 15111 + }, + { + "epoch": 0.15112, + "grad_norm": 0.8035820239386009, + "learning_rate": 0.003, + "loss": 4.067, + "step": 15112 + }, + { + "epoch": 0.15113, + "grad_norm": 0.7560033114405544, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 15113 + }, + { + "epoch": 0.15114, + "grad_norm": 0.7324938727488074, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 15114 + }, + { + "epoch": 0.15115, + "grad_norm": 0.8000574554573201, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 15115 + }, + { + "epoch": 0.15116, + "grad_norm": 0.756042649955105, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 15116 + }, + { + "epoch": 0.15117, + "grad_norm": 0.7620121160526505, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 15117 + }, + { + "epoch": 0.15118, + "grad_norm": 0.7799876604078111, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 15118 + }, + { + "epoch": 0.15119, + "grad_norm": 0.9251279569899423, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 15119 + }, + { + "epoch": 0.1512, + "grad_norm": 1.0591749364671184, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 15120 + }, + { + "epoch": 0.15121, + "grad_norm": 1.0544780767081794, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 15121 + }, + { + "epoch": 0.15122, + "grad_norm": 0.8011775350217684, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 15122 + }, + { + "epoch": 0.15123, + "grad_norm": 0.7048887305335355, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 15123 + }, + { + "epoch": 0.15124, + "grad_norm": 0.6646916324277997, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 15124 + }, + { + "epoch": 0.15125, + "grad_norm": 0.5997067875706642, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 15125 + }, + { + "epoch": 0.15126, + "grad_norm": 0.5284961902011485, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 15126 + }, + { + "epoch": 0.15127, + "grad_norm": 0.49008238412079774, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 15127 + }, + { + "epoch": 0.15128, + "grad_norm": 0.4691905705157153, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 15128 + }, + { + "epoch": 0.15129, + "grad_norm": 0.4792073352967458, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 15129 + }, + { + "epoch": 0.1513, + "grad_norm": 0.5133091416230263, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 15130 + }, + { + "epoch": 0.15131, + "grad_norm": 0.48283337225191414, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 15131 + }, + { + "epoch": 0.15132, + "grad_norm": 0.5550133025090831, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 15132 + }, + { + "epoch": 0.15133, + "grad_norm": 0.7060448342252426, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 15133 + }, + { + "epoch": 0.15134, + "grad_norm": 0.9177711985373354, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 15134 + }, + { + "epoch": 0.15135, + "grad_norm": 1.2631999758061152, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 15135 + }, + { + "epoch": 0.15136, + "grad_norm": 0.7457497717023199, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 15136 + }, + { + "epoch": 0.15137, + "grad_norm": 0.7275488284905349, + "learning_rate": 0.003, + "loss": 4.044, + "step": 15137 + }, + { + "epoch": 0.15138, + "grad_norm": 0.7659938351391803, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 15138 + }, + { + "epoch": 0.15139, + "grad_norm": 0.9471665683810248, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 15139 + }, + { + "epoch": 0.1514, + "grad_norm": 1.0644411698978926, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 15140 + }, + { + "epoch": 0.15141, + "grad_norm": 0.9911485071782313, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 15141 + }, + { + "epoch": 0.15142, + "grad_norm": 0.9177669091355584, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 15142 + }, + { + "epoch": 0.15143, + "grad_norm": 0.9539454491336194, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 15143 + }, + { + "epoch": 0.15144, + "grad_norm": 1.0190823580872017, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 15144 + }, + { + "epoch": 0.15145, + "grad_norm": 1.1097182863703374, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 15145 + }, + { + "epoch": 0.15146, + "grad_norm": 0.902108819088088, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 15146 + }, + { + "epoch": 0.15147, + "grad_norm": 0.8116744335567405, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 15147 + }, + { + "epoch": 0.15148, + "grad_norm": 0.8015320957439042, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 15148 + }, + { + "epoch": 0.15149, + "grad_norm": 0.8378619398388899, + "learning_rate": 0.003, + "loss": 4.091, + "step": 15149 + }, + { + "epoch": 0.1515, + "grad_norm": 0.8467095533469075, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 15150 + }, + { + "epoch": 0.15151, + "grad_norm": 0.9897372155878956, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 15151 + }, + { + "epoch": 0.15152, + "grad_norm": 0.9789423591472723, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 15152 + }, + { + "epoch": 0.15153, + "grad_norm": 1.0609979997095518, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 15153 + }, + { + "epoch": 0.15154, + "grad_norm": 1.3105194125792707, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 15154 + }, + { + "epoch": 0.15155, + "grad_norm": 0.9304455871903653, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 15155 + }, + { + "epoch": 0.15156, + "grad_norm": 0.8282275353630306, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 15156 + }, + { + "epoch": 0.15157, + "grad_norm": 0.9915551834570251, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 15157 + }, + { + "epoch": 0.15158, + "grad_norm": 1.0157859647007892, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 15158 + }, + { + "epoch": 0.15159, + "grad_norm": 1.033844815366442, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 15159 + }, + { + "epoch": 0.1516, + "grad_norm": 0.956169398803398, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 15160 + }, + { + "epoch": 0.15161, + "grad_norm": 1.0076553516889835, + "learning_rate": 0.003, + "loss": 4.077, + "step": 15161 + }, + { + "epoch": 0.15162, + "grad_norm": 1.125393989377539, + "learning_rate": 0.003, + "loss": 4.1165, + "step": 15162 + }, + { + "epoch": 0.15163, + "grad_norm": 0.8165843946971866, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 15163 + }, + { + "epoch": 0.15164, + "grad_norm": 0.7562017922440614, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 15164 + }, + { + "epoch": 0.15165, + "grad_norm": 0.7428884695099078, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 15165 + }, + { + "epoch": 0.15166, + "grad_norm": 0.8404484768990306, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 15166 + }, + { + "epoch": 0.15167, + "grad_norm": 0.8993097791945752, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 15167 + }, + { + "epoch": 0.15168, + "grad_norm": 0.9380835480998397, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 15168 + }, + { + "epoch": 0.15169, + "grad_norm": 1.0806486544246405, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 15169 + }, + { + "epoch": 0.1517, + "grad_norm": 0.9673967332511697, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 15170 + }, + { + "epoch": 0.15171, + "grad_norm": 0.7531630984905995, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 15171 + }, + { + "epoch": 0.15172, + "grad_norm": 0.7011234156390739, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 15172 + }, + { + "epoch": 0.15173, + "grad_norm": 0.6693307433636371, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 15173 + }, + { + "epoch": 0.15174, + "grad_norm": 0.7130280441231969, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 15174 + }, + { + "epoch": 0.15175, + "grad_norm": 0.7215414438726487, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 15175 + }, + { + "epoch": 0.15176, + "grad_norm": 0.6033413933498546, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 15176 + }, + { + "epoch": 0.15177, + "grad_norm": 0.6508264593081992, + "learning_rate": 0.003, + "loss": 4.053, + "step": 15177 + }, + { + "epoch": 0.15178, + "grad_norm": 0.8675552582776156, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 15178 + }, + { + "epoch": 0.15179, + "grad_norm": 1.0419533953805116, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 15179 + }, + { + "epoch": 0.1518, + "grad_norm": 1.0854078659240491, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 15180 + }, + { + "epoch": 0.15181, + "grad_norm": 0.7928904816498591, + "learning_rate": 0.003, + "loss": 4.05, + "step": 15181 + }, + { + "epoch": 0.15182, + "grad_norm": 0.6830974320765402, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 15182 + }, + { + "epoch": 0.15183, + "grad_norm": 0.7399601288407891, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 15183 + }, + { + "epoch": 0.15184, + "grad_norm": 0.7855594409722747, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 15184 + }, + { + "epoch": 0.15185, + "grad_norm": 0.9248773640331247, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 15185 + }, + { + "epoch": 0.15186, + "grad_norm": 0.9862123419049738, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 15186 + }, + { + "epoch": 0.15187, + "grad_norm": 1.102582252076831, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 15187 + }, + { + "epoch": 0.15188, + "grad_norm": 0.7775029083333368, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 15188 + }, + { + "epoch": 0.15189, + "grad_norm": 0.6506700585859129, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 15189 + }, + { + "epoch": 0.1519, + "grad_norm": 0.6796661020644874, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 15190 + }, + { + "epoch": 0.15191, + "grad_norm": 0.6760083572280191, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 15191 + }, + { + "epoch": 0.15192, + "grad_norm": 0.7339459038360747, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 15192 + }, + { + "epoch": 0.15193, + "grad_norm": 0.8568832935336129, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 15193 + }, + { + "epoch": 0.15194, + "grad_norm": 1.1195764067073293, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 15194 + }, + { + "epoch": 0.15195, + "grad_norm": 1.1094324595718625, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 15195 + }, + { + "epoch": 0.15196, + "grad_norm": 0.8664847142200764, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 15196 + }, + { + "epoch": 0.15197, + "grad_norm": 0.8739867532656782, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 15197 + }, + { + "epoch": 0.15198, + "grad_norm": 0.7545970355300466, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 15198 + }, + { + "epoch": 0.15199, + "grad_norm": 0.8349321201361193, + "learning_rate": 0.003, + "loss": 4.075, + "step": 15199 + }, + { + "epoch": 0.152, + "grad_norm": 0.7981624058949888, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 15200 + }, + { + "epoch": 0.15201, + "grad_norm": 0.7709086415901311, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 15201 + }, + { + "epoch": 0.15202, + "grad_norm": 0.8511088125328431, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 15202 + }, + { + "epoch": 0.15203, + "grad_norm": 0.9486382759577987, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 15203 + }, + { + "epoch": 0.15204, + "grad_norm": 1.0585494769383783, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 15204 + }, + { + "epoch": 0.15205, + "grad_norm": 1.0727065049309727, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 15205 + }, + { + "epoch": 0.15206, + "grad_norm": 1.0075358371153782, + "learning_rate": 0.003, + "loss": 4.059, + "step": 15206 + }, + { + "epoch": 0.15207, + "grad_norm": 0.9114685142048955, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 15207 + }, + { + "epoch": 0.15208, + "grad_norm": 0.8427218720162823, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 15208 + }, + { + "epoch": 0.15209, + "grad_norm": 0.7520032644693868, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 15209 + }, + { + "epoch": 0.1521, + "grad_norm": 0.8301931618735977, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 15210 + }, + { + "epoch": 0.15211, + "grad_norm": 1.126954103542896, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 15211 + }, + { + "epoch": 0.15212, + "grad_norm": 1.076102076696635, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 15212 + }, + { + "epoch": 0.15213, + "grad_norm": 0.942643678682756, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 15213 + }, + { + "epoch": 0.15214, + "grad_norm": 0.9153046076681154, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 15214 + }, + { + "epoch": 0.15215, + "grad_norm": 0.8335209604456619, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 15215 + }, + { + "epoch": 0.15216, + "grad_norm": 0.6990706248717341, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 15216 + }, + { + "epoch": 0.15217, + "grad_norm": 0.6654491730735902, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 15217 + }, + { + "epoch": 0.15218, + "grad_norm": 0.7229603138423157, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 15218 + }, + { + "epoch": 0.15219, + "grad_norm": 0.8142168305882189, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 15219 + }, + { + "epoch": 0.1522, + "grad_norm": 0.9415591426472244, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 15220 + }, + { + "epoch": 0.15221, + "grad_norm": 0.9994658536708769, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 15221 + }, + { + "epoch": 0.15222, + "grad_norm": 1.0465433905293442, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 15222 + }, + { + "epoch": 0.15223, + "grad_norm": 0.9197723655156945, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 15223 + }, + { + "epoch": 0.15224, + "grad_norm": 0.8735843811577606, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 15224 + }, + { + "epoch": 0.15225, + "grad_norm": 0.8178200665499099, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 15225 + }, + { + "epoch": 0.15226, + "grad_norm": 0.8322703784874306, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 15226 + }, + { + "epoch": 0.15227, + "grad_norm": 0.9952331935687299, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 15227 + }, + { + "epoch": 0.15228, + "grad_norm": 1.3104611198973144, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 15228 + }, + { + "epoch": 0.15229, + "grad_norm": 0.6856977527170464, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 15229 + }, + { + "epoch": 0.1523, + "grad_norm": 0.7001782128719186, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 15230 + }, + { + "epoch": 0.15231, + "grad_norm": 0.8286781366001644, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 15231 + }, + { + "epoch": 0.15232, + "grad_norm": 1.0706370569707482, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 15232 + }, + { + "epoch": 0.15233, + "grad_norm": 1.1556593051225217, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 15233 + }, + { + "epoch": 0.15234, + "grad_norm": 0.6819453233243812, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 15234 + }, + { + "epoch": 0.15235, + "grad_norm": 0.6612352239163759, + "learning_rate": 0.003, + "loss": 4.053, + "step": 15235 + }, + { + "epoch": 0.15236, + "grad_norm": 0.773020101727993, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 15236 + }, + { + "epoch": 0.15237, + "grad_norm": 0.8525464474951641, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 15237 + }, + { + "epoch": 0.15238, + "grad_norm": 0.8523068065969795, + "learning_rate": 0.003, + "loss": 4.046, + "step": 15238 + }, + { + "epoch": 0.15239, + "grad_norm": 0.900599962097635, + "learning_rate": 0.003, + "loss": 4.054, + "step": 15239 + }, + { + "epoch": 0.1524, + "grad_norm": 0.991414851124613, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 15240 + }, + { + "epoch": 0.15241, + "grad_norm": 1.107188195389757, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 15241 + }, + { + "epoch": 0.15242, + "grad_norm": 0.8704800607026462, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 15242 + }, + { + "epoch": 0.15243, + "grad_norm": 0.7404929086328644, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 15243 + }, + { + "epoch": 0.15244, + "grad_norm": 0.7703572874716674, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 15244 + }, + { + "epoch": 0.15245, + "grad_norm": 0.7829666178006482, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 15245 + }, + { + "epoch": 0.15246, + "grad_norm": 0.7415765331039469, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 15246 + }, + { + "epoch": 0.15247, + "grad_norm": 0.7924097397188576, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 15247 + }, + { + "epoch": 0.15248, + "grad_norm": 0.7777784226768076, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 15248 + }, + { + "epoch": 0.15249, + "grad_norm": 0.6998405033690062, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 15249 + }, + { + "epoch": 0.1525, + "grad_norm": 0.6404729897707369, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 15250 + }, + { + "epoch": 0.15251, + "grad_norm": 0.7452709958927278, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 15251 + }, + { + "epoch": 0.15252, + "grad_norm": 0.7940589446847499, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 15252 + }, + { + "epoch": 0.15253, + "grad_norm": 0.9481448417893192, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 15253 + }, + { + "epoch": 0.15254, + "grad_norm": 0.9766521625518498, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 15254 + }, + { + "epoch": 0.15255, + "grad_norm": 1.1123993025345629, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 15255 + }, + { + "epoch": 0.15256, + "grad_norm": 1.2492375752632872, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 15256 + }, + { + "epoch": 0.15257, + "grad_norm": 0.837986899539563, + "learning_rate": 0.003, + "loss": 4.058, + "step": 15257 + }, + { + "epoch": 0.15258, + "grad_norm": 0.7909502973323723, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 15258 + }, + { + "epoch": 0.15259, + "grad_norm": 0.7890035720589068, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 15259 + }, + { + "epoch": 0.1526, + "grad_norm": 0.8224144675886454, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 15260 + }, + { + "epoch": 0.15261, + "grad_norm": 0.8060568569256621, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 15261 + }, + { + "epoch": 0.15262, + "grad_norm": 0.8176292865252134, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 15262 + }, + { + "epoch": 0.15263, + "grad_norm": 0.8369337185990806, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 15263 + }, + { + "epoch": 0.15264, + "grad_norm": 0.8485265208416527, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 15264 + }, + { + "epoch": 0.15265, + "grad_norm": 0.8305809739761414, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 15265 + }, + { + "epoch": 0.15266, + "grad_norm": 0.8639285814044848, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 15266 + }, + { + "epoch": 0.15267, + "grad_norm": 0.773095963409235, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 15267 + }, + { + "epoch": 0.15268, + "grad_norm": 0.75198856269351, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 15268 + }, + { + "epoch": 0.15269, + "grad_norm": 0.7634260830024288, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 15269 + }, + { + "epoch": 0.1527, + "grad_norm": 0.7352566458153885, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 15270 + }, + { + "epoch": 0.15271, + "grad_norm": 0.6974783227375817, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 15271 + }, + { + "epoch": 0.15272, + "grad_norm": 0.7098188681579817, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 15272 + }, + { + "epoch": 0.15273, + "grad_norm": 1.000394283896658, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 15273 + }, + { + "epoch": 0.15274, + "grad_norm": 1.3197167826645684, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 15274 + }, + { + "epoch": 0.15275, + "grad_norm": 0.7384549959415955, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 15275 + }, + { + "epoch": 0.15276, + "grad_norm": 0.7163675883747808, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 15276 + }, + { + "epoch": 0.15277, + "grad_norm": 0.7950138913049991, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 15277 + }, + { + "epoch": 0.15278, + "grad_norm": 0.8092100122272277, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 15278 + }, + { + "epoch": 0.15279, + "grad_norm": 0.8240904170313513, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 15279 + }, + { + "epoch": 0.1528, + "grad_norm": 0.8183117073174399, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 15280 + }, + { + "epoch": 0.15281, + "grad_norm": 0.9165721380388888, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 15281 + }, + { + "epoch": 0.15282, + "grad_norm": 1.0164291672477201, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 15282 + }, + { + "epoch": 0.15283, + "grad_norm": 1.07127527329947, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 15283 + }, + { + "epoch": 0.15284, + "grad_norm": 1.056796739288713, + "learning_rate": 0.003, + "loss": 4.081, + "step": 15284 + }, + { + "epoch": 0.15285, + "grad_norm": 1.0656942720152707, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 15285 + }, + { + "epoch": 0.15286, + "grad_norm": 1.0005962235424413, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 15286 + }, + { + "epoch": 0.15287, + "grad_norm": 1.0357349400924054, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 15287 + }, + { + "epoch": 0.15288, + "grad_norm": 0.9635146374979597, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 15288 + }, + { + "epoch": 0.15289, + "grad_norm": 0.8736500322014659, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 15289 + }, + { + "epoch": 0.1529, + "grad_norm": 0.9360463166067774, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 15290 + }, + { + "epoch": 0.15291, + "grad_norm": 1.1124004238748042, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 15291 + }, + { + "epoch": 0.15292, + "grad_norm": 1.2161053720096802, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 15292 + }, + { + "epoch": 0.15293, + "grad_norm": 0.7816941440634604, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 15293 + }, + { + "epoch": 0.15294, + "grad_norm": 0.7438995370025482, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 15294 + }, + { + "epoch": 0.15295, + "grad_norm": 0.8139253736925429, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 15295 + }, + { + "epoch": 0.15296, + "grad_norm": 0.7653942018142526, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 15296 + }, + { + "epoch": 0.15297, + "grad_norm": 0.8419651678229138, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 15297 + }, + { + "epoch": 0.15298, + "grad_norm": 0.9039099983876641, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 15298 + }, + { + "epoch": 0.15299, + "grad_norm": 0.9237047675124964, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 15299 + }, + { + "epoch": 0.153, + "grad_norm": 0.9831123579516327, + "learning_rate": 0.003, + "loss": 4.101, + "step": 15300 + }, + { + "epoch": 0.15301, + "grad_norm": 1.0398480708757667, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 15301 + }, + { + "epoch": 0.15302, + "grad_norm": 0.9210196022719573, + "learning_rate": 0.003, + "loss": 4.045, + "step": 15302 + }, + { + "epoch": 0.15303, + "grad_norm": 0.8574354152970236, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 15303 + }, + { + "epoch": 0.15304, + "grad_norm": 0.9279757904048951, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 15304 + }, + { + "epoch": 0.15305, + "grad_norm": 0.9907124836345076, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 15305 + }, + { + "epoch": 0.15306, + "grad_norm": 0.9772938976980835, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 15306 + }, + { + "epoch": 0.15307, + "grad_norm": 0.8269054411701829, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 15307 + }, + { + "epoch": 0.15308, + "grad_norm": 0.7824618857627942, + "learning_rate": 0.003, + "loss": 4.08, + "step": 15308 + }, + { + "epoch": 0.15309, + "grad_norm": 0.845367517646765, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 15309 + }, + { + "epoch": 0.1531, + "grad_norm": 0.9322488255292629, + "learning_rate": 0.003, + "loss": 4.09, + "step": 15310 + }, + { + "epoch": 0.15311, + "grad_norm": 0.9918762112315062, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 15311 + }, + { + "epoch": 0.15312, + "grad_norm": 0.9692327736408248, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 15312 + }, + { + "epoch": 0.15313, + "grad_norm": 1.0406673818535623, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 15313 + }, + { + "epoch": 0.15314, + "grad_norm": 0.9548435951190011, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 15314 + }, + { + "epoch": 0.15315, + "grad_norm": 0.8409777942134875, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 15315 + }, + { + "epoch": 0.15316, + "grad_norm": 0.788125148875671, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 15316 + }, + { + "epoch": 0.15317, + "grad_norm": 0.8256558652012488, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 15317 + }, + { + "epoch": 0.15318, + "grad_norm": 0.8197919997498069, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 15318 + }, + { + "epoch": 0.15319, + "grad_norm": 0.8465533516485063, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 15319 + }, + { + "epoch": 0.1532, + "grad_norm": 0.8174693643961777, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 15320 + }, + { + "epoch": 0.15321, + "grad_norm": 0.7909793362987472, + "learning_rate": 0.003, + "loss": 4.071, + "step": 15321 + }, + { + "epoch": 0.15322, + "grad_norm": 0.8215613196846124, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 15322 + }, + { + "epoch": 0.15323, + "grad_norm": 0.8116288700030023, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 15323 + }, + { + "epoch": 0.15324, + "grad_norm": 0.7705645852664121, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 15324 + }, + { + "epoch": 0.15325, + "grad_norm": 0.7586459276769124, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 15325 + }, + { + "epoch": 0.15326, + "grad_norm": 0.8214815946327116, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 15326 + }, + { + "epoch": 0.15327, + "grad_norm": 0.859227403114001, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 15327 + }, + { + "epoch": 0.15328, + "grad_norm": 0.9247243898742153, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 15328 + }, + { + "epoch": 0.15329, + "grad_norm": 1.0070150338670758, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 15329 + }, + { + "epoch": 0.1533, + "grad_norm": 1.048426345837903, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 15330 + }, + { + "epoch": 0.15331, + "grad_norm": 0.8691143494307864, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 15331 + }, + { + "epoch": 0.15332, + "grad_norm": 0.7418188611393264, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 15332 + }, + { + "epoch": 0.15333, + "grad_norm": 0.7223747846535831, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 15333 + }, + { + "epoch": 0.15334, + "grad_norm": 0.7181767443064082, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 15334 + }, + { + "epoch": 0.15335, + "grad_norm": 0.9082406051597949, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 15335 + }, + { + "epoch": 0.15336, + "grad_norm": 1.0659744240190578, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 15336 + }, + { + "epoch": 0.15337, + "grad_norm": 0.8095698019551361, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 15337 + }, + { + "epoch": 0.15338, + "grad_norm": 0.7868786717459086, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 15338 + }, + { + "epoch": 0.15339, + "grad_norm": 0.8599956330432876, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 15339 + }, + { + "epoch": 0.1534, + "grad_norm": 0.7706506291807872, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 15340 + }, + { + "epoch": 0.15341, + "grad_norm": 0.8721086785843887, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 15341 + }, + { + "epoch": 0.15342, + "grad_norm": 1.0597943106109282, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 15342 + }, + { + "epoch": 0.15343, + "grad_norm": 1.0315648440749439, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 15343 + }, + { + "epoch": 0.15344, + "grad_norm": 1.0445752058541913, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 15344 + }, + { + "epoch": 0.15345, + "grad_norm": 1.126879153111509, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 15345 + }, + { + "epoch": 0.15346, + "grad_norm": 0.8758151127517635, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 15346 + }, + { + "epoch": 0.15347, + "grad_norm": 0.8887417297013118, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 15347 + }, + { + "epoch": 0.15348, + "grad_norm": 0.9078925324284505, + "learning_rate": 0.003, + "loss": 4.086, + "step": 15348 + }, + { + "epoch": 0.15349, + "grad_norm": 0.9437893858843688, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 15349 + }, + { + "epoch": 0.1535, + "grad_norm": 1.028551586928702, + "learning_rate": 0.003, + "loss": 4.083, + "step": 15350 + }, + { + "epoch": 0.15351, + "grad_norm": 0.9838643587197328, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 15351 + }, + { + "epoch": 0.15352, + "grad_norm": 1.0179072480536766, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 15352 + }, + { + "epoch": 0.15353, + "grad_norm": 1.0587174480745536, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 15353 + }, + { + "epoch": 0.15354, + "grad_norm": 0.7937917540579997, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 15354 + }, + { + "epoch": 0.15355, + "grad_norm": 0.7618401905739632, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 15355 + }, + { + "epoch": 0.15356, + "grad_norm": 0.7476017996224481, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 15356 + }, + { + "epoch": 0.15357, + "grad_norm": 0.7168312754858822, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 15357 + }, + { + "epoch": 0.15358, + "grad_norm": 0.6652411138469404, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 15358 + }, + { + "epoch": 0.15359, + "grad_norm": 0.5990865842852017, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 15359 + }, + { + "epoch": 0.1536, + "grad_norm": 0.610952128432108, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 15360 + }, + { + "epoch": 0.15361, + "grad_norm": 0.5850669319398033, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 15361 + }, + { + "epoch": 0.15362, + "grad_norm": 0.7146752691579432, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 15362 + }, + { + "epoch": 0.15363, + "grad_norm": 0.8886228198316868, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 15363 + }, + { + "epoch": 0.15364, + "grad_norm": 0.9598647006128489, + "learning_rate": 0.003, + "loss": 4.061, + "step": 15364 + }, + { + "epoch": 0.15365, + "grad_norm": 0.9996920584638711, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 15365 + }, + { + "epoch": 0.15366, + "grad_norm": 1.0729749653640965, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 15366 + }, + { + "epoch": 0.15367, + "grad_norm": 0.8812989363422271, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 15367 + }, + { + "epoch": 0.15368, + "grad_norm": 0.9612399829762662, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 15368 + }, + { + "epoch": 0.15369, + "grad_norm": 1.2373738936147978, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 15369 + }, + { + "epoch": 0.1537, + "grad_norm": 1.0254891506435038, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 15370 + }, + { + "epoch": 0.15371, + "grad_norm": 0.8992332799160001, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 15371 + }, + { + "epoch": 0.15372, + "grad_norm": 0.8148899378746298, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 15372 + }, + { + "epoch": 0.15373, + "grad_norm": 0.8704590629676359, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 15373 + }, + { + "epoch": 0.15374, + "grad_norm": 1.0363916590663378, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 15374 + }, + { + "epoch": 0.15375, + "grad_norm": 0.8676314135519507, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 15375 + }, + { + "epoch": 0.15376, + "grad_norm": 0.7084009364673541, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 15376 + }, + { + "epoch": 0.15377, + "grad_norm": 0.698532730339605, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 15377 + }, + { + "epoch": 0.15378, + "grad_norm": 0.6100615528067144, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 15378 + }, + { + "epoch": 0.15379, + "grad_norm": 0.6477153996238283, + "learning_rate": 0.003, + "loss": 4.091, + "step": 15379 + }, + { + "epoch": 0.1538, + "grad_norm": 0.6877543332267829, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 15380 + }, + { + "epoch": 0.15381, + "grad_norm": 0.6537584265239311, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 15381 + }, + { + "epoch": 0.15382, + "grad_norm": 0.763424128315055, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 15382 + }, + { + "epoch": 0.15383, + "grad_norm": 0.8125123084912943, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 15383 + }, + { + "epoch": 0.15384, + "grad_norm": 0.7606723703046439, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 15384 + }, + { + "epoch": 0.15385, + "grad_norm": 0.6884466700203843, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 15385 + }, + { + "epoch": 0.15386, + "grad_norm": 0.6467478719710251, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 15386 + }, + { + "epoch": 0.15387, + "grad_norm": 0.6245316876044793, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 15387 + }, + { + "epoch": 0.15388, + "grad_norm": 0.6356434538542645, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 15388 + }, + { + "epoch": 0.15389, + "grad_norm": 0.6373822402222443, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 15389 + }, + { + "epoch": 0.1539, + "grad_norm": 0.7718510561746118, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 15390 + }, + { + "epoch": 0.15391, + "grad_norm": 0.9061684310137172, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 15391 + }, + { + "epoch": 0.15392, + "grad_norm": 1.1775563525898003, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 15392 + }, + { + "epoch": 0.15393, + "grad_norm": 0.9732620045236304, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 15393 + }, + { + "epoch": 0.15394, + "grad_norm": 0.9950541024320676, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 15394 + }, + { + "epoch": 0.15395, + "grad_norm": 0.9328073183592768, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 15395 + }, + { + "epoch": 0.15396, + "grad_norm": 1.0172921681937264, + "learning_rate": 0.003, + "loss": 4.045, + "step": 15396 + }, + { + "epoch": 0.15397, + "grad_norm": 1.0314286160597368, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 15397 + }, + { + "epoch": 0.15398, + "grad_norm": 1.1131646114895066, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 15398 + }, + { + "epoch": 0.15399, + "grad_norm": 0.8061905602399027, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 15399 + }, + { + "epoch": 0.154, + "grad_norm": 0.7281692435789039, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 15400 + }, + { + "epoch": 0.15401, + "grad_norm": 0.7136606728727093, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 15401 + }, + { + "epoch": 0.15402, + "grad_norm": 0.7044475162085178, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 15402 + }, + { + "epoch": 0.15403, + "grad_norm": 0.7620517403211995, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 15403 + }, + { + "epoch": 0.15404, + "grad_norm": 0.8943543003820128, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 15404 + }, + { + "epoch": 0.15405, + "grad_norm": 1.0634772629309255, + "learning_rate": 0.003, + "loss": 4.1146, + "step": 15405 + }, + { + "epoch": 0.15406, + "grad_norm": 1.0489995344298069, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 15406 + }, + { + "epoch": 0.15407, + "grad_norm": 0.936834227930002, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 15407 + }, + { + "epoch": 0.15408, + "grad_norm": 0.8968233810962296, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 15408 + }, + { + "epoch": 0.15409, + "grad_norm": 0.8650125405473275, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 15409 + }, + { + "epoch": 0.1541, + "grad_norm": 0.8351046721298193, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 15410 + }, + { + "epoch": 0.15411, + "grad_norm": 0.8456992735792717, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 15411 + }, + { + "epoch": 0.15412, + "grad_norm": 0.9616974588856743, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 15412 + }, + { + "epoch": 0.15413, + "grad_norm": 1.228338969955763, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 15413 + }, + { + "epoch": 0.15414, + "grad_norm": 0.9327203029167266, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 15414 + }, + { + "epoch": 0.15415, + "grad_norm": 1.0369311853927972, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 15415 + }, + { + "epoch": 0.15416, + "grad_norm": 1.1884505990050545, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 15416 + }, + { + "epoch": 0.15417, + "grad_norm": 0.9161430344272499, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 15417 + }, + { + "epoch": 0.15418, + "grad_norm": 1.091616793433311, + "learning_rate": 0.003, + "loss": 4.051, + "step": 15418 + }, + { + "epoch": 0.15419, + "grad_norm": 1.1469923028891025, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 15419 + }, + { + "epoch": 0.1542, + "grad_norm": 0.8134990180415536, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 15420 + }, + { + "epoch": 0.15421, + "grad_norm": 0.749698599058815, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 15421 + }, + { + "epoch": 0.15422, + "grad_norm": 0.7126708577307878, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 15422 + }, + { + "epoch": 0.15423, + "grad_norm": 0.6560223291309407, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 15423 + }, + { + "epoch": 0.15424, + "grad_norm": 0.6434394591799395, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 15424 + }, + { + "epoch": 0.15425, + "grad_norm": 0.7214371154521346, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 15425 + }, + { + "epoch": 0.15426, + "grad_norm": 0.7707324717352958, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 15426 + }, + { + "epoch": 0.15427, + "grad_norm": 0.9784461671732944, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 15427 + }, + { + "epoch": 0.15428, + "grad_norm": 1.0751486596194002, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 15428 + }, + { + "epoch": 0.15429, + "grad_norm": 0.9342153187863622, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 15429 + }, + { + "epoch": 0.1543, + "grad_norm": 0.9635213215700588, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 15430 + }, + { + "epoch": 0.15431, + "grad_norm": 0.9691929750211792, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 15431 + }, + { + "epoch": 0.15432, + "grad_norm": 0.857186961871027, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 15432 + }, + { + "epoch": 0.15433, + "grad_norm": 0.8988001941635466, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 15433 + }, + { + "epoch": 0.15434, + "grad_norm": 0.9092065583290543, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 15434 + }, + { + "epoch": 0.15435, + "grad_norm": 1.0763532230832278, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 15435 + }, + { + "epoch": 0.15436, + "grad_norm": 0.9015359583839574, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 15436 + }, + { + "epoch": 0.15437, + "grad_norm": 0.8159108758688398, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 15437 + }, + { + "epoch": 0.15438, + "grad_norm": 0.8669191192028983, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 15438 + }, + { + "epoch": 0.15439, + "grad_norm": 0.8622772622890306, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 15439 + }, + { + "epoch": 0.1544, + "grad_norm": 0.862024516715705, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 15440 + }, + { + "epoch": 0.15441, + "grad_norm": 1.0019341765054546, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 15441 + }, + { + "epoch": 0.15442, + "grad_norm": 1.0460953698286313, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 15442 + }, + { + "epoch": 0.15443, + "grad_norm": 0.9056468395580206, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 15443 + }, + { + "epoch": 0.15444, + "grad_norm": 0.8762762068790201, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 15444 + }, + { + "epoch": 0.15445, + "grad_norm": 0.9318923437666932, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 15445 + }, + { + "epoch": 0.15446, + "grad_norm": 1.0039644072078817, + "learning_rate": 0.003, + "loss": 4.06, + "step": 15446 + }, + { + "epoch": 0.15447, + "grad_norm": 1.1812029223095382, + "learning_rate": 0.003, + "loss": 4.083, + "step": 15447 + }, + { + "epoch": 0.15448, + "grad_norm": 0.7460119650318848, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 15448 + }, + { + "epoch": 0.15449, + "grad_norm": 0.6190852095293629, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 15449 + }, + { + "epoch": 0.1545, + "grad_norm": 0.7711414583657713, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 15450 + }, + { + "epoch": 0.15451, + "grad_norm": 0.8699766039956232, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 15451 + }, + { + "epoch": 0.15452, + "grad_norm": 0.918779270026279, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 15452 + }, + { + "epoch": 0.15453, + "grad_norm": 1.0578539680005319, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 15453 + }, + { + "epoch": 0.15454, + "grad_norm": 1.1553769043442381, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 15454 + }, + { + "epoch": 0.15455, + "grad_norm": 0.8973035021789558, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 15455 + }, + { + "epoch": 0.15456, + "grad_norm": 0.8607516540225554, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 15456 + }, + { + "epoch": 0.15457, + "grad_norm": 0.8289164207339367, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 15457 + }, + { + "epoch": 0.15458, + "grad_norm": 0.7399216084926746, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 15458 + }, + { + "epoch": 0.15459, + "grad_norm": 0.6263654591934297, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 15459 + }, + { + "epoch": 0.1546, + "grad_norm": 0.6189360858960318, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 15460 + }, + { + "epoch": 0.15461, + "grad_norm": 0.6654599519087065, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 15461 + }, + { + "epoch": 0.15462, + "grad_norm": 0.6672983694893821, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 15462 + }, + { + "epoch": 0.15463, + "grad_norm": 0.7409763602890124, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 15463 + }, + { + "epoch": 0.15464, + "grad_norm": 0.7518234475164591, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 15464 + }, + { + "epoch": 0.15465, + "grad_norm": 0.8086846319741519, + "learning_rate": 0.003, + "loss": 4.046, + "step": 15465 + }, + { + "epoch": 0.15466, + "grad_norm": 0.8869002598209089, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 15466 + }, + { + "epoch": 0.15467, + "grad_norm": 1.1329410550211505, + "learning_rate": 0.003, + "loss": 4.069, + "step": 15467 + }, + { + "epoch": 0.15468, + "grad_norm": 1.0645004856880826, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 15468 + }, + { + "epoch": 0.15469, + "grad_norm": 1.0267216422241157, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 15469 + }, + { + "epoch": 0.1547, + "grad_norm": 0.9149964881198352, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 15470 + }, + { + "epoch": 0.15471, + "grad_norm": 0.8287218209214179, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 15471 + }, + { + "epoch": 0.15472, + "grad_norm": 0.7796472725168954, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 15472 + }, + { + "epoch": 0.15473, + "grad_norm": 0.7761389980881427, + "learning_rate": 0.003, + "loss": 4.084, + "step": 15473 + }, + { + "epoch": 0.15474, + "grad_norm": 0.7270120134919071, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 15474 + }, + { + "epoch": 0.15475, + "grad_norm": 0.7763197507518927, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 15475 + }, + { + "epoch": 0.15476, + "grad_norm": 0.7674069391230546, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 15476 + }, + { + "epoch": 0.15477, + "grad_norm": 0.7647874309242547, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 15477 + }, + { + "epoch": 0.15478, + "grad_norm": 0.7402283439445141, + "learning_rate": 0.003, + "loss": 4.056, + "step": 15478 + }, + { + "epoch": 0.15479, + "grad_norm": 0.7704264106087683, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 15479 + }, + { + "epoch": 0.1548, + "grad_norm": 0.8739289097868363, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 15480 + }, + { + "epoch": 0.15481, + "grad_norm": 1.1263533864300832, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 15481 + }, + { + "epoch": 0.15482, + "grad_norm": 0.9525460025294208, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 15482 + }, + { + "epoch": 0.15483, + "grad_norm": 0.9333168866468142, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 15483 + }, + { + "epoch": 0.15484, + "grad_norm": 0.9730324058784258, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 15484 + }, + { + "epoch": 0.15485, + "grad_norm": 0.909897383863647, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 15485 + }, + { + "epoch": 0.15486, + "grad_norm": 0.8009061464324777, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 15486 + }, + { + "epoch": 0.15487, + "grad_norm": 0.6404602622859169, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 15487 + }, + { + "epoch": 0.15488, + "grad_norm": 0.6460085282699142, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 15488 + }, + { + "epoch": 0.15489, + "grad_norm": 0.6917772168227212, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 15489 + }, + { + "epoch": 0.1549, + "grad_norm": 0.7452533445497697, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 15490 + }, + { + "epoch": 0.15491, + "grad_norm": 0.74914215694899, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 15491 + }, + { + "epoch": 0.15492, + "grad_norm": 0.6500003700792345, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 15492 + }, + { + "epoch": 0.15493, + "grad_norm": 0.7208607957872774, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 15493 + }, + { + "epoch": 0.15494, + "grad_norm": 0.7954539482905564, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 15494 + }, + { + "epoch": 0.15495, + "grad_norm": 1.1448476120062712, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 15495 + }, + { + "epoch": 0.15496, + "grad_norm": 1.0429540905268704, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 15496 + }, + { + "epoch": 0.15497, + "grad_norm": 0.9922058860462024, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 15497 + }, + { + "epoch": 0.15498, + "grad_norm": 1.0374318456690717, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 15498 + }, + { + "epoch": 0.15499, + "grad_norm": 0.850225728016009, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 15499 + }, + { + "epoch": 0.155, + "grad_norm": 0.823741912085178, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 15500 + }, + { + "epoch": 0.15501, + "grad_norm": 0.9238563261828668, + "learning_rate": 0.003, + "loss": 4.057, + "step": 15501 + }, + { + "epoch": 0.15502, + "grad_norm": 1.1054316400914197, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 15502 + }, + { + "epoch": 0.15503, + "grad_norm": 1.0008061464778677, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 15503 + }, + { + "epoch": 0.15504, + "grad_norm": 1.1001959250202507, + "learning_rate": 0.003, + "loss": 4.091, + "step": 15504 + }, + { + "epoch": 0.15505, + "grad_norm": 0.8872338983350687, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 15505 + }, + { + "epoch": 0.15506, + "grad_norm": 0.9467325459884459, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 15506 + }, + { + "epoch": 0.15507, + "grad_norm": 1.009616961840951, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 15507 + }, + { + "epoch": 0.15508, + "grad_norm": 1.1703253045225837, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 15508 + }, + { + "epoch": 0.15509, + "grad_norm": 0.9910258078543399, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 15509 + }, + { + "epoch": 0.1551, + "grad_norm": 0.8590018006910455, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 15510 + }, + { + "epoch": 0.15511, + "grad_norm": 0.8221040719720489, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 15511 + }, + { + "epoch": 0.15512, + "grad_norm": 0.8550604390636547, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 15512 + }, + { + "epoch": 0.15513, + "grad_norm": 0.8035956560395793, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 15513 + }, + { + "epoch": 0.15514, + "grad_norm": 0.9233044007204195, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 15514 + }, + { + "epoch": 0.15515, + "grad_norm": 0.9381953022868085, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 15515 + }, + { + "epoch": 0.15516, + "grad_norm": 1.039866091398066, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 15516 + }, + { + "epoch": 0.15517, + "grad_norm": 1.170461779810142, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 15517 + }, + { + "epoch": 0.15518, + "grad_norm": 0.8593146424575652, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 15518 + }, + { + "epoch": 0.15519, + "grad_norm": 0.8535163044475705, + "learning_rate": 0.003, + "loss": 4.055, + "step": 15519 + }, + { + "epoch": 0.1552, + "grad_norm": 0.8908455444384552, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 15520 + }, + { + "epoch": 0.15521, + "grad_norm": 0.9530158401911695, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 15521 + }, + { + "epoch": 0.15522, + "grad_norm": 1.0710813249073143, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 15522 + }, + { + "epoch": 0.15523, + "grad_norm": 0.9547514449675653, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 15523 + }, + { + "epoch": 0.15524, + "grad_norm": 0.9708381670048761, + "learning_rate": 0.003, + "loss": 4.092, + "step": 15524 + }, + { + "epoch": 0.15525, + "grad_norm": 0.9281972432170034, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 15525 + }, + { + "epoch": 0.15526, + "grad_norm": 0.9452754971493649, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 15526 + }, + { + "epoch": 0.15527, + "grad_norm": 1.1336347797235296, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 15527 + }, + { + "epoch": 0.15528, + "grad_norm": 0.9822485734434496, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 15528 + }, + { + "epoch": 0.15529, + "grad_norm": 0.9144468853148353, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 15529 + }, + { + "epoch": 0.1553, + "grad_norm": 0.9621912015091932, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 15530 + }, + { + "epoch": 0.15531, + "grad_norm": 0.834701156342796, + "learning_rate": 0.003, + "loss": 4.064, + "step": 15531 + }, + { + "epoch": 0.15532, + "grad_norm": 0.7038072073156139, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 15532 + }, + { + "epoch": 0.15533, + "grad_norm": 0.7864258445634408, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 15533 + }, + { + "epoch": 0.15534, + "grad_norm": 0.8498937190027228, + "learning_rate": 0.003, + "loss": 4.091, + "step": 15534 + }, + { + "epoch": 0.15535, + "grad_norm": 0.9988490320319162, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 15535 + }, + { + "epoch": 0.15536, + "grad_norm": 1.104970485487583, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 15536 + }, + { + "epoch": 0.15537, + "grad_norm": 0.8122962187221906, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 15537 + }, + { + "epoch": 0.15538, + "grad_norm": 0.6829837254445589, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 15538 + }, + { + "epoch": 0.15539, + "grad_norm": 0.6239711604895152, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 15539 + }, + { + "epoch": 0.1554, + "grad_norm": 0.5897056435055278, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 15540 + }, + { + "epoch": 0.15541, + "grad_norm": 0.5551094437362032, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 15541 + }, + { + "epoch": 0.15542, + "grad_norm": 0.56511733964809, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 15542 + }, + { + "epoch": 0.15543, + "grad_norm": 0.6117786371457542, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 15543 + }, + { + "epoch": 0.15544, + "grad_norm": 0.7046444160464634, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 15544 + }, + { + "epoch": 0.15545, + "grad_norm": 0.9654374717196152, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 15545 + }, + { + "epoch": 0.15546, + "grad_norm": 1.2749271054755769, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 15546 + }, + { + "epoch": 0.15547, + "grad_norm": 0.745483960991596, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 15547 + }, + { + "epoch": 0.15548, + "grad_norm": 0.7855719930691469, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 15548 + }, + { + "epoch": 0.15549, + "grad_norm": 0.7601840785983824, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 15549 + }, + { + "epoch": 0.1555, + "grad_norm": 0.8142037516037431, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 15550 + }, + { + "epoch": 0.15551, + "grad_norm": 0.7949631614947104, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 15551 + }, + { + "epoch": 0.15552, + "grad_norm": 0.716858353732648, + "learning_rate": 0.003, + "loss": 4.046, + "step": 15552 + }, + { + "epoch": 0.15553, + "grad_norm": 0.8449742704018046, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 15553 + }, + { + "epoch": 0.15554, + "grad_norm": 0.9028674042410417, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 15554 + }, + { + "epoch": 0.15555, + "grad_norm": 1.0332359239049087, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 15555 + }, + { + "epoch": 0.15556, + "grad_norm": 1.3285988413244725, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 15556 + }, + { + "epoch": 0.15557, + "grad_norm": 0.82007769013011, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 15557 + }, + { + "epoch": 0.15558, + "grad_norm": 0.6894776810610859, + "learning_rate": 0.003, + "loss": 4.065, + "step": 15558 + }, + { + "epoch": 0.15559, + "grad_norm": 0.6316581580258641, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 15559 + }, + { + "epoch": 0.1556, + "grad_norm": 0.7116632678924181, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 15560 + }, + { + "epoch": 0.15561, + "grad_norm": 0.82837303506694, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 15561 + }, + { + "epoch": 0.15562, + "grad_norm": 0.9611383598681128, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 15562 + }, + { + "epoch": 0.15563, + "grad_norm": 1.092475113649714, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 15563 + }, + { + "epoch": 0.15564, + "grad_norm": 1.060393024021851, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 15564 + }, + { + "epoch": 0.15565, + "grad_norm": 1.0487472071725026, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 15565 + }, + { + "epoch": 0.15566, + "grad_norm": 0.9463545538800499, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 15566 + }, + { + "epoch": 0.15567, + "grad_norm": 0.7501415029738309, + "learning_rate": 0.003, + "loss": 4.061, + "step": 15567 + }, + { + "epoch": 0.15568, + "grad_norm": 0.6219431825249087, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 15568 + }, + { + "epoch": 0.15569, + "grad_norm": 0.6918215471451595, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 15569 + }, + { + "epoch": 0.1557, + "grad_norm": 0.8086146257580983, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 15570 + }, + { + "epoch": 0.15571, + "grad_norm": 0.8533713854330912, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 15571 + }, + { + "epoch": 0.15572, + "grad_norm": 0.8780257870236586, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 15572 + }, + { + "epoch": 0.15573, + "grad_norm": 0.8496333543849813, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 15573 + }, + { + "epoch": 0.15574, + "grad_norm": 0.9110526708886625, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 15574 + }, + { + "epoch": 0.15575, + "grad_norm": 1.015019738431502, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 15575 + }, + { + "epoch": 0.15576, + "grad_norm": 0.969549567680602, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 15576 + }, + { + "epoch": 0.15577, + "grad_norm": 0.9051056955696452, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 15577 + }, + { + "epoch": 0.15578, + "grad_norm": 0.964665349585722, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 15578 + }, + { + "epoch": 0.15579, + "grad_norm": 1.1679976408997053, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 15579 + }, + { + "epoch": 0.1558, + "grad_norm": 1.102628850605355, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 15580 + }, + { + "epoch": 0.15581, + "grad_norm": 0.9956977239385475, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 15581 + }, + { + "epoch": 0.15582, + "grad_norm": 1.097055256038559, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 15582 + }, + { + "epoch": 0.15583, + "grad_norm": 1.0646531800168875, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 15583 + }, + { + "epoch": 0.15584, + "grad_norm": 0.8760851156942279, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 15584 + }, + { + "epoch": 0.15585, + "grad_norm": 0.8943881993935535, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 15585 + }, + { + "epoch": 0.15586, + "grad_norm": 0.9190415541194136, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 15586 + }, + { + "epoch": 0.15587, + "grad_norm": 0.8556050544661058, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 15587 + }, + { + "epoch": 0.15588, + "grad_norm": 0.6945857287160594, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 15588 + }, + { + "epoch": 0.15589, + "grad_norm": 0.6471982189951477, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 15589 + }, + { + "epoch": 0.1559, + "grad_norm": 0.6662768618836037, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 15590 + }, + { + "epoch": 0.15591, + "grad_norm": 0.7599561337701207, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 15591 + }, + { + "epoch": 0.15592, + "grad_norm": 0.775076448494908, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 15592 + }, + { + "epoch": 0.15593, + "grad_norm": 0.9174076140493366, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 15593 + }, + { + "epoch": 0.15594, + "grad_norm": 1.2169025906499393, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 15594 + }, + { + "epoch": 0.15595, + "grad_norm": 0.9000408502979979, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 15595 + }, + { + "epoch": 0.15596, + "grad_norm": 0.6406303812655746, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 15596 + }, + { + "epoch": 0.15597, + "grad_norm": 0.7788255556972971, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 15597 + }, + { + "epoch": 0.15598, + "grad_norm": 1.0347369074474213, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 15598 + }, + { + "epoch": 0.15599, + "grad_norm": 1.035064376849458, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 15599 + }, + { + "epoch": 0.156, + "grad_norm": 0.9692133530209425, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 15600 + }, + { + "epoch": 0.15601, + "grad_norm": 1.0450323767106415, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 15601 + }, + { + "epoch": 0.15602, + "grad_norm": 1.0699803348981658, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 15602 + }, + { + "epoch": 0.15603, + "grad_norm": 1.0875426637442756, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 15603 + }, + { + "epoch": 0.15604, + "grad_norm": 0.8694194830067608, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 15604 + }, + { + "epoch": 0.15605, + "grad_norm": 0.8734698103010141, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 15605 + }, + { + "epoch": 0.15606, + "grad_norm": 0.9072438647082046, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 15606 + }, + { + "epoch": 0.15607, + "grad_norm": 0.8103647416641858, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 15607 + }, + { + "epoch": 0.15608, + "grad_norm": 0.7938669168603442, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 15608 + }, + { + "epoch": 0.15609, + "grad_norm": 0.751983038353267, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 15609 + }, + { + "epoch": 0.1561, + "grad_norm": 0.8291569967019005, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 15610 + }, + { + "epoch": 0.15611, + "grad_norm": 0.9380251030634215, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 15611 + }, + { + "epoch": 0.15612, + "grad_norm": 0.9915590564103034, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 15612 + }, + { + "epoch": 0.15613, + "grad_norm": 1.1092022208759647, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 15613 + }, + { + "epoch": 0.15614, + "grad_norm": 0.7807231605511251, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 15614 + }, + { + "epoch": 0.15615, + "grad_norm": 0.7924724999912204, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 15615 + }, + { + "epoch": 0.15616, + "grad_norm": 0.8434872336299037, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 15616 + }, + { + "epoch": 0.15617, + "grad_norm": 0.8657148855571658, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 15617 + }, + { + "epoch": 0.15618, + "grad_norm": 0.8000624026317716, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 15618 + }, + { + "epoch": 0.15619, + "grad_norm": 0.7572274328551856, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 15619 + }, + { + "epoch": 0.1562, + "grad_norm": 0.8103320823809439, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 15620 + }, + { + "epoch": 0.15621, + "grad_norm": 0.7878389534066891, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 15621 + }, + { + "epoch": 0.15622, + "grad_norm": 0.7443935066134619, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 15622 + }, + { + "epoch": 0.15623, + "grad_norm": 0.6766944161961428, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 15623 + }, + { + "epoch": 0.15624, + "grad_norm": 0.697385615345915, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 15624 + }, + { + "epoch": 0.15625, + "grad_norm": 0.8204428082539018, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 15625 + }, + { + "epoch": 0.15626, + "grad_norm": 0.8759598190282629, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 15626 + }, + { + "epoch": 0.15627, + "grad_norm": 0.9387044096573821, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 15627 + }, + { + "epoch": 0.15628, + "grad_norm": 1.3407919288523218, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 15628 + }, + { + "epoch": 0.15629, + "grad_norm": 0.8778475783479913, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 15629 + }, + { + "epoch": 0.1563, + "grad_norm": 0.8032525466694488, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 15630 + }, + { + "epoch": 0.15631, + "grad_norm": 0.7706529729065191, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 15631 + }, + { + "epoch": 0.15632, + "grad_norm": 0.777650827857467, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 15632 + }, + { + "epoch": 0.15633, + "grad_norm": 0.7507406733082125, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 15633 + }, + { + "epoch": 0.15634, + "grad_norm": 0.7526007500523764, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 15634 + }, + { + "epoch": 0.15635, + "grad_norm": 0.873807783654368, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 15635 + }, + { + "epoch": 0.15636, + "grad_norm": 1.082259618076844, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 15636 + }, + { + "epoch": 0.15637, + "grad_norm": 1.2133033360040355, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 15637 + }, + { + "epoch": 0.15638, + "grad_norm": 0.7981373134230838, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 15638 + }, + { + "epoch": 0.15639, + "grad_norm": 0.7395655181129475, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 15639 + }, + { + "epoch": 0.1564, + "grad_norm": 0.7191230219618001, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 15640 + }, + { + "epoch": 0.15641, + "grad_norm": 0.8915858040822598, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 15641 + }, + { + "epoch": 0.15642, + "grad_norm": 1.049270737940305, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 15642 + }, + { + "epoch": 0.15643, + "grad_norm": 1.0616572559274393, + "learning_rate": 0.003, + "loss": 4.092, + "step": 15643 + }, + { + "epoch": 0.15644, + "grad_norm": 1.1734717817211473, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 15644 + }, + { + "epoch": 0.15645, + "grad_norm": 0.908989516417746, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 15645 + }, + { + "epoch": 0.15646, + "grad_norm": 0.7782227095912202, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 15646 + }, + { + "epoch": 0.15647, + "grad_norm": 0.7433683006151823, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 15647 + }, + { + "epoch": 0.15648, + "grad_norm": 0.7168902425258895, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 15648 + }, + { + "epoch": 0.15649, + "grad_norm": 0.8293715696958915, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 15649 + }, + { + "epoch": 0.1565, + "grad_norm": 1.121938905967932, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 15650 + }, + { + "epoch": 0.15651, + "grad_norm": 1.1008889824817694, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 15651 + }, + { + "epoch": 0.15652, + "grad_norm": 0.9041408844798634, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 15652 + }, + { + "epoch": 0.15653, + "grad_norm": 0.9356524254061378, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 15653 + }, + { + "epoch": 0.15654, + "grad_norm": 1.017812118603101, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 15654 + }, + { + "epoch": 0.15655, + "grad_norm": 0.8849690125000396, + "learning_rate": 0.003, + "loss": 4.1142, + "step": 15655 + }, + { + "epoch": 0.15656, + "grad_norm": 0.80066641325765, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 15656 + }, + { + "epoch": 0.15657, + "grad_norm": 0.7720592171862735, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 15657 + }, + { + "epoch": 0.15658, + "grad_norm": 0.8310448609370841, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 15658 + }, + { + "epoch": 0.15659, + "grad_norm": 0.8911564350069221, + "learning_rate": 0.003, + "loss": 4.089, + "step": 15659 + }, + { + "epoch": 0.1566, + "grad_norm": 0.9011121850370314, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 15660 + }, + { + "epoch": 0.15661, + "grad_norm": 0.8308295448043017, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 15661 + }, + { + "epoch": 0.15662, + "grad_norm": 0.8220893814376936, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 15662 + }, + { + "epoch": 0.15663, + "grad_norm": 0.8508278807804223, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 15663 + }, + { + "epoch": 0.15664, + "grad_norm": 0.8810822277320196, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 15664 + }, + { + "epoch": 0.15665, + "grad_norm": 0.8979826756292933, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 15665 + }, + { + "epoch": 0.15666, + "grad_norm": 1.0794416018717115, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 15666 + }, + { + "epoch": 0.15667, + "grad_norm": 1.1323057680415858, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 15667 + }, + { + "epoch": 0.15668, + "grad_norm": 0.9457768178013904, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 15668 + }, + { + "epoch": 0.15669, + "grad_norm": 0.822780495101964, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 15669 + }, + { + "epoch": 0.1567, + "grad_norm": 0.7925062459807524, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 15670 + }, + { + "epoch": 0.15671, + "grad_norm": 0.7373144217051462, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 15671 + }, + { + "epoch": 0.15672, + "grad_norm": 0.6886291315707866, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 15672 + }, + { + "epoch": 0.15673, + "grad_norm": 0.744018490301524, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 15673 + }, + { + "epoch": 0.15674, + "grad_norm": 0.898881703571009, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 15674 + }, + { + "epoch": 0.15675, + "grad_norm": 1.0405910871431263, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 15675 + }, + { + "epoch": 0.15676, + "grad_norm": 1.096964710533686, + "learning_rate": 0.003, + "loss": 4.091, + "step": 15676 + }, + { + "epoch": 0.15677, + "grad_norm": 0.9826366381431736, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 15677 + }, + { + "epoch": 0.15678, + "grad_norm": 0.8258066792600829, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 15678 + }, + { + "epoch": 0.15679, + "grad_norm": 0.6680673033501574, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 15679 + }, + { + "epoch": 0.1568, + "grad_norm": 0.6398973115746407, + "learning_rate": 0.003, + "loss": 4.052, + "step": 15680 + }, + { + "epoch": 0.15681, + "grad_norm": 0.7222595822300055, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 15681 + }, + { + "epoch": 0.15682, + "grad_norm": 0.8086635202275088, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 15682 + }, + { + "epoch": 0.15683, + "grad_norm": 0.977112135109623, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 15683 + }, + { + "epoch": 0.15684, + "grad_norm": 1.109455056486782, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 15684 + }, + { + "epoch": 0.15685, + "grad_norm": 0.9090749241980511, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 15685 + }, + { + "epoch": 0.15686, + "grad_norm": 0.9628260634567355, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 15686 + }, + { + "epoch": 0.15687, + "grad_norm": 0.9642946528879622, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 15687 + }, + { + "epoch": 0.15688, + "grad_norm": 0.854095914822299, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 15688 + }, + { + "epoch": 0.15689, + "grad_norm": 0.8479285272959167, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 15689 + }, + { + "epoch": 0.1569, + "grad_norm": 0.8781520499741814, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 15690 + }, + { + "epoch": 0.15691, + "grad_norm": 0.9017737170528728, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 15691 + }, + { + "epoch": 0.15692, + "grad_norm": 1.0384170734104432, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 15692 + }, + { + "epoch": 0.15693, + "grad_norm": 1.0401987567002433, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 15693 + }, + { + "epoch": 0.15694, + "grad_norm": 1.0079068238266227, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 15694 + }, + { + "epoch": 0.15695, + "grad_norm": 1.0256911911475113, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 15695 + }, + { + "epoch": 0.15696, + "grad_norm": 0.9743371047297614, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 15696 + }, + { + "epoch": 0.15697, + "grad_norm": 0.9372429085578399, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 15697 + }, + { + "epoch": 0.15698, + "grad_norm": 0.8421375798506733, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 15698 + }, + { + "epoch": 0.15699, + "grad_norm": 0.8581421722938858, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 15699 + }, + { + "epoch": 0.157, + "grad_norm": 0.9545500654908698, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 15700 + }, + { + "epoch": 0.15701, + "grad_norm": 0.9543262188975177, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 15701 + }, + { + "epoch": 0.15702, + "grad_norm": 0.9627815404535414, + "learning_rate": 0.003, + "loss": 4.069, + "step": 15702 + }, + { + "epoch": 0.15703, + "grad_norm": 0.9918419255125808, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 15703 + }, + { + "epoch": 0.15704, + "grad_norm": 1.0131857783053495, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 15704 + }, + { + "epoch": 0.15705, + "grad_norm": 1.0257031076667074, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 15705 + }, + { + "epoch": 0.15706, + "grad_norm": 1.0286957803718653, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 15706 + }, + { + "epoch": 0.15707, + "grad_norm": 0.9186980426032404, + "learning_rate": 0.003, + "loss": 4.076, + "step": 15707 + }, + { + "epoch": 0.15708, + "grad_norm": 0.7730894632026283, + "learning_rate": 0.003, + "loss": 4.077, + "step": 15708 + }, + { + "epoch": 0.15709, + "grad_norm": 0.7607058204213273, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 15709 + }, + { + "epoch": 0.1571, + "grad_norm": 0.847158509950108, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 15710 + }, + { + "epoch": 0.15711, + "grad_norm": 0.7855540977503699, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 15711 + }, + { + "epoch": 0.15712, + "grad_norm": 0.7590691475382797, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 15712 + }, + { + "epoch": 0.15713, + "grad_norm": 0.7515180294257787, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 15713 + }, + { + "epoch": 0.15714, + "grad_norm": 0.9504277167206577, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 15714 + }, + { + "epoch": 0.15715, + "grad_norm": 1.198440609131435, + "learning_rate": 0.003, + "loss": 4.061, + "step": 15715 + }, + { + "epoch": 0.15716, + "grad_norm": 1.0858578296053583, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 15716 + }, + { + "epoch": 0.15717, + "grad_norm": 1.0199720058816553, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 15717 + }, + { + "epoch": 0.15718, + "grad_norm": 0.8520616767350729, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 15718 + }, + { + "epoch": 0.15719, + "grad_norm": 0.8426834502301647, + "learning_rate": 0.003, + "loss": 4.074, + "step": 15719 + }, + { + "epoch": 0.1572, + "grad_norm": 0.7956666337344602, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 15720 + }, + { + "epoch": 0.15721, + "grad_norm": 0.8129533212501311, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 15721 + }, + { + "epoch": 0.15722, + "grad_norm": 0.7928362352001691, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 15722 + }, + { + "epoch": 0.15723, + "grad_norm": 0.829214299157527, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 15723 + }, + { + "epoch": 0.15724, + "grad_norm": 0.919715823163542, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 15724 + }, + { + "epoch": 0.15725, + "grad_norm": 1.1444190031629045, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 15725 + }, + { + "epoch": 0.15726, + "grad_norm": 0.9073740489717936, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 15726 + }, + { + "epoch": 0.15727, + "grad_norm": 0.7513967054772299, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 15727 + }, + { + "epoch": 0.15728, + "grad_norm": 0.6879061506877912, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 15728 + }, + { + "epoch": 0.15729, + "grad_norm": 0.7177779412606914, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 15729 + }, + { + "epoch": 0.1573, + "grad_norm": 0.694528917822137, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 15730 + }, + { + "epoch": 0.15731, + "grad_norm": 0.5816306084682613, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 15731 + }, + { + "epoch": 0.15732, + "grad_norm": 0.6184272707278754, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 15732 + }, + { + "epoch": 0.15733, + "grad_norm": 0.6447521953877523, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 15733 + }, + { + "epoch": 0.15734, + "grad_norm": 0.7368951937763425, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 15734 + }, + { + "epoch": 0.15735, + "grad_norm": 0.8355992550878152, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 15735 + }, + { + "epoch": 0.15736, + "grad_norm": 1.0353058417685639, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 15736 + }, + { + "epoch": 0.15737, + "grad_norm": 0.9629005530656198, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 15737 + }, + { + "epoch": 0.15738, + "grad_norm": 1.1452608606326131, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 15738 + }, + { + "epoch": 0.15739, + "grad_norm": 0.8067676323285322, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 15739 + }, + { + "epoch": 0.1574, + "grad_norm": 0.6876944333280933, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 15740 + }, + { + "epoch": 0.15741, + "grad_norm": 0.6487259660436814, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 15741 + }, + { + "epoch": 0.15742, + "grad_norm": 0.6728003567964077, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 15742 + }, + { + "epoch": 0.15743, + "grad_norm": 0.7232384826137596, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 15743 + }, + { + "epoch": 0.15744, + "grad_norm": 0.7748727714315641, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 15744 + }, + { + "epoch": 0.15745, + "grad_norm": 0.9137185643541321, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 15745 + }, + { + "epoch": 0.15746, + "grad_norm": 1.0152304776651153, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 15746 + }, + { + "epoch": 0.15747, + "grad_norm": 1.2635504910008515, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 15747 + }, + { + "epoch": 0.15748, + "grad_norm": 0.9544033390249997, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 15748 + }, + { + "epoch": 0.15749, + "grad_norm": 0.8535703853345411, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 15749 + }, + { + "epoch": 0.1575, + "grad_norm": 0.9144994248160686, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 15750 + }, + { + "epoch": 0.15751, + "grad_norm": 1.1130684787630263, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 15751 + }, + { + "epoch": 0.15752, + "grad_norm": 0.9559660859466867, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 15752 + }, + { + "epoch": 0.15753, + "grad_norm": 0.9439558528154831, + "learning_rate": 0.003, + "loss": 4.085, + "step": 15753 + }, + { + "epoch": 0.15754, + "grad_norm": 1.0810191752234264, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 15754 + }, + { + "epoch": 0.15755, + "grad_norm": 0.9101250314218337, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 15755 + }, + { + "epoch": 0.15756, + "grad_norm": 0.80897549279076, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 15756 + }, + { + "epoch": 0.15757, + "grad_norm": 0.6965237862948986, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 15757 + }, + { + "epoch": 0.15758, + "grad_norm": 0.8162714537084819, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 15758 + }, + { + "epoch": 0.15759, + "grad_norm": 0.9764982859149774, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 15759 + }, + { + "epoch": 0.1576, + "grad_norm": 1.0017412746249308, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 15760 + }, + { + "epoch": 0.15761, + "grad_norm": 0.8578906445848595, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 15761 + }, + { + "epoch": 0.15762, + "grad_norm": 0.7995798485272619, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 15762 + }, + { + "epoch": 0.15763, + "grad_norm": 0.8079100884732389, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 15763 + }, + { + "epoch": 0.15764, + "grad_norm": 0.8170771717330821, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 15764 + }, + { + "epoch": 0.15765, + "grad_norm": 0.8462133152211715, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 15765 + }, + { + "epoch": 0.15766, + "grad_norm": 0.9314144890134789, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 15766 + }, + { + "epoch": 0.15767, + "grad_norm": 0.9572502108046494, + "learning_rate": 0.003, + "loss": 4.096, + "step": 15767 + }, + { + "epoch": 0.15768, + "grad_norm": 1.1736672094140246, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 15768 + }, + { + "epoch": 0.15769, + "grad_norm": 0.9717898579580005, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 15769 + }, + { + "epoch": 0.1577, + "grad_norm": 0.9281000936883017, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 15770 + }, + { + "epoch": 0.15771, + "grad_norm": 0.9756625332243586, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 15771 + }, + { + "epoch": 0.15772, + "grad_norm": 1.0761356894686833, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 15772 + }, + { + "epoch": 0.15773, + "grad_norm": 1.0718172437686129, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 15773 + }, + { + "epoch": 0.15774, + "grad_norm": 0.8564932899366225, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 15774 + }, + { + "epoch": 0.15775, + "grad_norm": 0.8429720031221283, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 15775 + }, + { + "epoch": 0.15776, + "grad_norm": 0.8145436415841807, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 15776 + }, + { + "epoch": 0.15777, + "grad_norm": 0.7495825811763993, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 15777 + }, + { + "epoch": 0.15778, + "grad_norm": 0.66056685707336, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 15778 + }, + { + "epoch": 0.15779, + "grad_norm": 0.7236335334459462, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 15779 + }, + { + "epoch": 0.1578, + "grad_norm": 0.7928322617373862, + "learning_rate": 0.003, + "loss": 4.078, + "step": 15780 + }, + { + "epoch": 0.15781, + "grad_norm": 0.9811309359272324, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 15781 + }, + { + "epoch": 0.15782, + "grad_norm": 1.2922646757724578, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 15782 + }, + { + "epoch": 0.15783, + "grad_norm": 0.613175412425735, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 15783 + }, + { + "epoch": 0.15784, + "grad_norm": 0.8101068988943512, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 15784 + }, + { + "epoch": 0.15785, + "grad_norm": 1.0459800872368754, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 15785 + }, + { + "epoch": 0.15786, + "grad_norm": 0.9335789478468234, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 15786 + }, + { + "epoch": 0.15787, + "grad_norm": 0.8549098445919725, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 15787 + }, + { + "epoch": 0.15788, + "grad_norm": 0.6652901808347063, + "learning_rate": 0.003, + "loss": 4.086, + "step": 15788 + }, + { + "epoch": 0.15789, + "grad_norm": 0.7645271509465255, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 15789 + }, + { + "epoch": 0.1579, + "grad_norm": 0.881282108531678, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 15790 + }, + { + "epoch": 0.15791, + "grad_norm": 0.8842507702863724, + "learning_rate": 0.003, + "loss": 4.079, + "step": 15791 + }, + { + "epoch": 0.15792, + "grad_norm": 0.9161298484082427, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 15792 + }, + { + "epoch": 0.15793, + "grad_norm": 0.8918615565759648, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 15793 + }, + { + "epoch": 0.15794, + "grad_norm": 0.8461781028666253, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 15794 + }, + { + "epoch": 0.15795, + "grad_norm": 0.796053179876573, + "learning_rate": 0.003, + "loss": 4.082, + "step": 15795 + }, + { + "epoch": 0.15796, + "grad_norm": 0.7884645267478809, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 15796 + }, + { + "epoch": 0.15797, + "grad_norm": 0.8402589525212326, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 15797 + }, + { + "epoch": 0.15798, + "grad_norm": 0.9791280961884749, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 15798 + }, + { + "epoch": 0.15799, + "grad_norm": 1.0466519313257117, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 15799 + }, + { + "epoch": 0.158, + "grad_norm": 0.9546207591549601, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 15800 + }, + { + "epoch": 0.15801, + "grad_norm": 0.9295367269240018, + "learning_rate": 0.003, + "loss": 4.066, + "step": 15801 + }, + { + "epoch": 0.15802, + "grad_norm": 0.8491558503792144, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 15802 + }, + { + "epoch": 0.15803, + "grad_norm": 0.7909596313425348, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 15803 + }, + { + "epoch": 0.15804, + "grad_norm": 0.8561291185647388, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 15804 + }, + { + "epoch": 0.15805, + "grad_norm": 0.9291004483511431, + "learning_rate": 0.003, + "loss": 4.09, + "step": 15805 + }, + { + "epoch": 0.15806, + "grad_norm": 1.0798417303303218, + "learning_rate": 0.003, + "loss": 4.09, + "step": 15806 + }, + { + "epoch": 0.15807, + "grad_norm": 0.9130461523318819, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 15807 + }, + { + "epoch": 0.15808, + "grad_norm": 0.7153070233852723, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 15808 + }, + { + "epoch": 0.15809, + "grad_norm": 0.6466685309472603, + "learning_rate": 0.003, + "loss": 4.061, + "step": 15809 + }, + { + "epoch": 0.1581, + "grad_norm": 0.7137648139929387, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 15810 + }, + { + "epoch": 0.15811, + "grad_norm": 0.7371865634872482, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 15811 + }, + { + "epoch": 0.15812, + "grad_norm": 0.666837145811955, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 15812 + }, + { + "epoch": 0.15813, + "grad_norm": 0.6849383084673003, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 15813 + }, + { + "epoch": 0.15814, + "grad_norm": 0.7690573260358415, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 15814 + }, + { + "epoch": 0.15815, + "grad_norm": 0.9014056808320806, + "learning_rate": 0.003, + "loss": 4.06, + "step": 15815 + }, + { + "epoch": 0.15816, + "grad_norm": 0.9974616340619585, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 15816 + }, + { + "epoch": 0.15817, + "grad_norm": 1.0018362625024821, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 15817 + }, + { + "epoch": 0.15818, + "grad_norm": 1.0045306907978533, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 15818 + }, + { + "epoch": 0.15819, + "grad_norm": 1.044796734807557, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 15819 + }, + { + "epoch": 0.1582, + "grad_norm": 1.1060677062990496, + "learning_rate": 0.003, + "loss": 4.1162, + "step": 15820 + }, + { + "epoch": 0.15821, + "grad_norm": 0.9625014805197428, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 15821 + }, + { + "epoch": 0.15822, + "grad_norm": 1.1276907425219156, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 15822 + }, + { + "epoch": 0.15823, + "grad_norm": 0.9231495786036672, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 15823 + }, + { + "epoch": 0.15824, + "grad_norm": 0.7210559162759669, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 15824 + }, + { + "epoch": 0.15825, + "grad_norm": 0.6689758088881779, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 15825 + }, + { + "epoch": 0.15826, + "grad_norm": 0.7991481396230212, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 15826 + }, + { + "epoch": 0.15827, + "grad_norm": 0.9195265416903654, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 15827 + }, + { + "epoch": 0.15828, + "grad_norm": 0.9227451049506976, + "learning_rate": 0.003, + "loss": 4.05, + "step": 15828 + }, + { + "epoch": 0.15829, + "grad_norm": 1.183853321591796, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 15829 + }, + { + "epoch": 0.1583, + "grad_norm": 1.116908744750503, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 15830 + }, + { + "epoch": 0.15831, + "grad_norm": 0.9280804561427755, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 15831 + }, + { + "epoch": 0.15832, + "grad_norm": 0.6993523895866569, + "learning_rate": 0.003, + "loss": 4.069, + "step": 15832 + }, + { + "epoch": 0.15833, + "grad_norm": 0.6778153881642555, + "learning_rate": 0.003, + "loss": 4.041, + "step": 15833 + }, + { + "epoch": 0.15834, + "grad_norm": 0.6710842473104999, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 15834 + }, + { + "epoch": 0.15835, + "grad_norm": 0.7940880968918053, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 15835 + }, + { + "epoch": 0.15836, + "grad_norm": 0.9486913042238366, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 15836 + }, + { + "epoch": 0.15837, + "grad_norm": 0.9574558816711672, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 15837 + }, + { + "epoch": 0.15838, + "grad_norm": 0.8016829926384446, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 15838 + }, + { + "epoch": 0.15839, + "grad_norm": 0.8767245454791264, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 15839 + }, + { + "epoch": 0.1584, + "grad_norm": 0.9753319003274548, + "learning_rate": 0.003, + "loss": 4.061, + "step": 15840 + }, + { + "epoch": 0.15841, + "grad_norm": 0.935985741070316, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 15841 + }, + { + "epoch": 0.15842, + "grad_norm": 0.9731181664130097, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 15842 + }, + { + "epoch": 0.15843, + "grad_norm": 0.9810573764018233, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 15843 + }, + { + "epoch": 0.15844, + "grad_norm": 1.0527692285822723, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 15844 + }, + { + "epoch": 0.15845, + "grad_norm": 0.9724371067157719, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 15845 + }, + { + "epoch": 0.15846, + "grad_norm": 0.9355618632943368, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 15846 + }, + { + "epoch": 0.15847, + "grad_norm": 0.8765585917398276, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 15847 + }, + { + "epoch": 0.15848, + "grad_norm": 0.8960885793948482, + "learning_rate": 0.003, + "loss": 4.059, + "step": 15848 + }, + { + "epoch": 0.15849, + "grad_norm": 0.9078083133473381, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 15849 + }, + { + "epoch": 0.1585, + "grad_norm": 0.9588368589979, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 15850 + }, + { + "epoch": 0.15851, + "grad_norm": 1.0538067692342266, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 15851 + }, + { + "epoch": 0.15852, + "grad_norm": 0.8705415590237465, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 15852 + }, + { + "epoch": 0.15853, + "grad_norm": 0.7556140365323665, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 15853 + }, + { + "epoch": 0.15854, + "grad_norm": 0.708190257785594, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 15854 + }, + { + "epoch": 0.15855, + "grad_norm": 0.7239228060794102, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 15855 + }, + { + "epoch": 0.15856, + "grad_norm": 0.7071674787914626, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 15856 + }, + { + "epoch": 0.15857, + "grad_norm": 0.9379712638299085, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 15857 + }, + { + "epoch": 0.15858, + "grad_norm": 1.1165791795695725, + "learning_rate": 0.003, + "loss": 4.095, + "step": 15858 + }, + { + "epoch": 0.15859, + "grad_norm": 0.8722408835796578, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 15859 + }, + { + "epoch": 0.1586, + "grad_norm": 0.7530543252965523, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 15860 + }, + { + "epoch": 0.15861, + "grad_norm": 0.6665608961709756, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 15861 + }, + { + "epoch": 0.15862, + "grad_norm": 0.730624445103437, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 15862 + }, + { + "epoch": 0.15863, + "grad_norm": 0.8559718685431023, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 15863 + }, + { + "epoch": 0.15864, + "grad_norm": 1.0446524778821076, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 15864 + }, + { + "epoch": 0.15865, + "grad_norm": 0.9897685739575387, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 15865 + }, + { + "epoch": 0.15866, + "grad_norm": 0.8793218934252126, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 15866 + }, + { + "epoch": 0.15867, + "grad_norm": 0.8602972215958646, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 15867 + }, + { + "epoch": 0.15868, + "grad_norm": 0.8087072696062023, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 15868 + }, + { + "epoch": 0.15869, + "grad_norm": 0.7744091911306118, + "learning_rate": 0.003, + "loss": 4.035, + "step": 15869 + }, + { + "epoch": 0.1587, + "grad_norm": 0.8032899597741882, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 15870 + }, + { + "epoch": 0.15871, + "grad_norm": 0.8212848129479703, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 15871 + }, + { + "epoch": 0.15872, + "grad_norm": 0.8098673310288629, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 15872 + }, + { + "epoch": 0.15873, + "grad_norm": 0.8344842706362949, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 15873 + }, + { + "epoch": 0.15874, + "grad_norm": 0.9609025124157962, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 15874 + }, + { + "epoch": 0.15875, + "grad_norm": 1.166264056059148, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 15875 + }, + { + "epoch": 0.15876, + "grad_norm": 0.9426802309816715, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 15876 + }, + { + "epoch": 0.15877, + "grad_norm": 0.9748216070848362, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 15877 + }, + { + "epoch": 0.15878, + "grad_norm": 0.9652460088843033, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 15878 + }, + { + "epoch": 0.15879, + "grad_norm": 0.9470132798977262, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 15879 + }, + { + "epoch": 0.1588, + "grad_norm": 0.8330772838015335, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 15880 + }, + { + "epoch": 0.15881, + "grad_norm": 0.8834610872328756, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 15881 + }, + { + "epoch": 0.15882, + "grad_norm": 0.9353522346926292, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 15882 + }, + { + "epoch": 0.15883, + "grad_norm": 0.8955176576822342, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 15883 + }, + { + "epoch": 0.15884, + "grad_norm": 0.9101327685911393, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 15884 + }, + { + "epoch": 0.15885, + "grad_norm": 0.8630667318408559, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 15885 + }, + { + "epoch": 0.15886, + "grad_norm": 0.8895435355974495, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 15886 + }, + { + "epoch": 0.15887, + "grad_norm": 1.02701068155811, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 15887 + }, + { + "epoch": 0.15888, + "grad_norm": 1.1528507805179966, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 15888 + }, + { + "epoch": 0.15889, + "grad_norm": 0.8222781462504181, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 15889 + }, + { + "epoch": 0.1589, + "grad_norm": 0.6987602441668954, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 15890 + }, + { + "epoch": 0.15891, + "grad_norm": 0.7600823317276932, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 15891 + }, + { + "epoch": 0.15892, + "grad_norm": 0.8391905920734278, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 15892 + }, + { + "epoch": 0.15893, + "grad_norm": 0.8387386380485324, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 15893 + }, + { + "epoch": 0.15894, + "grad_norm": 0.7891534314815928, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 15894 + }, + { + "epoch": 0.15895, + "grad_norm": 0.810900494614915, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 15895 + }, + { + "epoch": 0.15896, + "grad_norm": 0.9182939036553969, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 15896 + }, + { + "epoch": 0.15897, + "grad_norm": 0.9616215424728389, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 15897 + }, + { + "epoch": 0.15898, + "grad_norm": 0.8999451087939887, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 15898 + }, + { + "epoch": 0.15899, + "grad_norm": 0.9174561922257051, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 15899 + }, + { + "epoch": 0.159, + "grad_norm": 1.0073397840502547, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 15900 + }, + { + "epoch": 0.15901, + "grad_norm": 0.8570122026857621, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 15901 + }, + { + "epoch": 0.15902, + "grad_norm": 0.7256120389509747, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 15902 + }, + { + "epoch": 0.15903, + "grad_norm": 0.77451479463265, + "learning_rate": 0.003, + "loss": 4.07, + "step": 15903 + }, + { + "epoch": 0.15904, + "grad_norm": 0.8142437498841877, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 15904 + }, + { + "epoch": 0.15905, + "grad_norm": 0.8886496045970114, + "learning_rate": 0.003, + "loss": 4.058, + "step": 15905 + }, + { + "epoch": 0.15906, + "grad_norm": 0.7823787993194367, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 15906 + }, + { + "epoch": 0.15907, + "grad_norm": 0.8576597012670907, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 15907 + }, + { + "epoch": 0.15908, + "grad_norm": 1.0226556653999712, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 15908 + }, + { + "epoch": 0.15909, + "grad_norm": 1.228440531695168, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 15909 + }, + { + "epoch": 0.1591, + "grad_norm": 0.8072323047436248, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 15910 + }, + { + "epoch": 0.15911, + "grad_norm": 0.6625593171708525, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 15911 + }, + { + "epoch": 0.15912, + "grad_norm": 0.7641361854965768, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 15912 + }, + { + "epoch": 0.15913, + "grad_norm": 0.8955766167997712, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 15913 + }, + { + "epoch": 0.15914, + "grad_norm": 0.9284289402262057, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 15914 + }, + { + "epoch": 0.15915, + "grad_norm": 0.8998828107354967, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 15915 + }, + { + "epoch": 0.15916, + "grad_norm": 0.9402371663310323, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 15916 + }, + { + "epoch": 0.15917, + "grad_norm": 0.9902722681553752, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 15917 + }, + { + "epoch": 0.15918, + "grad_norm": 1.0974782386344044, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 15918 + }, + { + "epoch": 0.15919, + "grad_norm": 0.8974947705604499, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 15919 + }, + { + "epoch": 0.1592, + "grad_norm": 0.8323694012148697, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 15920 + }, + { + "epoch": 0.15921, + "grad_norm": 0.8507533872744127, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 15921 + }, + { + "epoch": 0.15922, + "grad_norm": 1.0006896458398264, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 15922 + }, + { + "epoch": 0.15923, + "grad_norm": 1.0863181557188903, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 15923 + }, + { + "epoch": 0.15924, + "grad_norm": 0.9521719258873104, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 15924 + }, + { + "epoch": 0.15925, + "grad_norm": 0.944414005035015, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 15925 + }, + { + "epoch": 0.15926, + "grad_norm": 0.9296021380224746, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 15926 + }, + { + "epoch": 0.15927, + "grad_norm": 0.8613107161757301, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 15927 + }, + { + "epoch": 0.15928, + "grad_norm": 0.9249367893175384, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 15928 + }, + { + "epoch": 0.15929, + "grad_norm": 0.8878680131531572, + "learning_rate": 0.003, + "loss": 4.086, + "step": 15929 + }, + { + "epoch": 0.1593, + "grad_norm": 1.013715090911043, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 15930 + }, + { + "epoch": 0.15931, + "grad_norm": 0.8510340941979475, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 15931 + }, + { + "epoch": 0.15932, + "grad_norm": 0.815676870885383, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 15932 + }, + { + "epoch": 0.15933, + "grad_norm": 0.7743675386553625, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 15933 + }, + { + "epoch": 0.15934, + "grad_norm": 0.7403873546053269, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 15934 + }, + { + "epoch": 0.15935, + "grad_norm": 0.7858125515042295, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 15935 + }, + { + "epoch": 0.15936, + "grad_norm": 0.8930344673960163, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 15936 + }, + { + "epoch": 0.15937, + "grad_norm": 0.9316602155800011, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 15937 + }, + { + "epoch": 0.15938, + "grad_norm": 0.8816423475685182, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 15938 + }, + { + "epoch": 0.15939, + "grad_norm": 0.8023319702925781, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 15939 + }, + { + "epoch": 0.1594, + "grad_norm": 0.8651903663949028, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 15940 + }, + { + "epoch": 0.15941, + "grad_norm": 0.8852802210463416, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 15941 + }, + { + "epoch": 0.15942, + "grad_norm": 0.8149679450595194, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 15942 + }, + { + "epoch": 0.15943, + "grad_norm": 0.7056486815572971, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 15943 + }, + { + "epoch": 0.15944, + "grad_norm": 0.7494177680373358, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 15944 + }, + { + "epoch": 0.15945, + "grad_norm": 0.8463757146270932, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 15945 + }, + { + "epoch": 0.15946, + "grad_norm": 0.9484762575642697, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 15946 + }, + { + "epoch": 0.15947, + "grad_norm": 1.2333563410789112, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 15947 + }, + { + "epoch": 0.15948, + "grad_norm": 0.7909947284917025, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 15948 + }, + { + "epoch": 0.15949, + "grad_norm": 0.7306097375896525, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 15949 + }, + { + "epoch": 0.1595, + "grad_norm": 0.7554572014212151, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 15950 + }, + { + "epoch": 0.15951, + "grad_norm": 0.8045348607371821, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 15951 + }, + { + "epoch": 0.15952, + "grad_norm": 0.8632792409610709, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 15952 + }, + { + "epoch": 0.15953, + "grad_norm": 0.8832684523003634, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 15953 + }, + { + "epoch": 0.15954, + "grad_norm": 0.9299209913713404, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 15954 + }, + { + "epoch": 0.15955, + "grad_norm": 1.032021770746047, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 15955 + }, + { + "epoch": 0.15956, + "grad_norm": 0.9818817360177301, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 15956 + }, + { + "epoch": 0.15957, + "grad_norm": 0.9918576095944156, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 15957 + }, + { + "epoch": 0.15958, + "grad_norm": 0.9072398848104601, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 15958 + }, + { + "epoch": 0.15959, + "grad_norm": 0.8848674987600716, + "learning_rate": 0.003, + "loss": 4.079, + "step": 15959 + }, + { + "epoch": 0.1596, + "grad_norm": 0.8117353518274283, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 15960 + }, + { + "epoch": 0.15961, + "grad_norm": 0.813328243786487, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 15961 + }, + { + "epoch": 0.15962, + "grad_norm": 0.977835910106764, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 15962 + }, + { + "epoch": 0.15963, + "grad_norm": 1.2892851496837776, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 15963 + }, + { + "epoch": 0.15964, + "grad_norm": 0.6903747880207948, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 15964 + }, + { + "epoch": 0.15965, + "grad_norm": 0.6844003685678792, + "learning_rate": 0.003, + "loss": 4.075, + "step": 15965 + }, + { + "epoch": 0.15966, + "grad_norm": 0.664731853915502, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 15966 + }, + { + "epoch": 0.15967, + "grad_norm": 0.7169235637818511, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 15967 + }, + { + "epoch": 0.15968, + "grad_norm": 0.8060018350301563, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 15968 + }, + { + "epoch": 0.15969, + "grad_norm": 0.9091242557631898, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 15969 + }, + { + "epoch": 0.1597, + "grad_norm": 1.070008204035234, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 15970 + }, + { + "epoch": 0.15971, + "grad_norm": 1.2103233997348608, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 15971 + }, + { + "epoch": 0.15972, + "grad_norm": 0.8094656173431708, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 15972 + }, + { + "epoch": 0.15973, + "grad_norm": 0.8478586830339004, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 15973 + }, + { + "epoch": 0.15974, + "grad_norm": 0.8653546240953361, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 15974 + }, + { + "epoch": 0.15975, + "grad_norm": 0.8860408958322518, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 15975 + }, + { + "epoch": 0.15976, + "grad_norm": 0.9102600963263386, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 15976 + }, + { + "epoch": 0.15977, + "grad_norm": 0.8727421657068137, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 15977 + }, + { + "epoch": 0.15978, + "grad_norm": 0.9107200803049118, + "learning_rate": 0.003, + "loss": 4.074, + "step": 15978 + }, + { + "epoch": 0.15979, + "grad_norm": 0.9341700720119386, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 15979 + }, + { + "epoch": 0.1598, + "grad_norm": 0.8318930244131669, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 15980 + }, + { + "epoch": 0.15981, + "grad_norm": 0.7454439507132611, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 15981 + }, + { + "epoch": 0.15982, + "grad_norm": 0.9058637695553168, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 15982 + }, + { + "epoch": 0.15983, + "grad_norm": 1.2318091116124625, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 15983 + }, + { + "epoch": 0.15984, + "grad_norm": 0.928161210640171, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 15984 + }, + { + "epoch": 0.15985, + "grad_norm": 0.8825323891365938, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 15985 + }, + { + "epoch": 0.15986, + "grad_norm": 0.9331902724573091, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 15986 + }, + { + "epoch": 0.15987, + "grad_norm": 0.9173291854109743, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 15987 + }, + { + "epoch": 0.15988, + "grad_norm": 0.6923402307071086, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 15988 + }, + { + "epoch": 0.15989, + "grad_norm": 0.6881565806819077, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 15989 + }, + { + "epoch": 0.1599, + "grad_norm": 0.6415710035119405, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 15990 + }, + { + "epoch": 0.15991, + "grad_norm": 0.5702418106814733, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 15991 + }, + { + "epoch": 0.15992, + "grad_norm": 0.5454637420160554, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 15992 + }, + { + "epoch": 0.15993, + "grad_norm": 0.6275972535879244, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 15993 + }, + { + "epoch": 0.15994, + "grad_norm": 0.6948256041813252, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 15994 + }, + { + "epoch": 0.15995, + "grad_norm": 0.7618122762233865, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 15995 + }, + { + "epoch": 0.15996, + "grad_norm": 0.7733683885650522, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 15996 + }, + { + "epoch": 0.15997, + "grad_norm": 0.9205548246120419, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 15997 + }, + { + "epoch": 0.15998, + "grad_norm": 1.0012476550778662, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 15998 + }, + { + "epoch": 0.15999, + "grad_norm": 0.8926619011650007, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 15999 + }, + { + "epoch": 0.16, + "grad_norm": 1.04034913772308, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 16000 + }, + { + "epoch": 0.16001, + "grad_norm": 0.9875749945099409, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 16001 + }, + { + "epoch": 0.16002, + "grad_norm": 1.0522854332233935, + "learning_rate": 0.003, + "loss": 4.07, + "step": 16002 + }, + { + "epoch": 0.16003, + "grad_norm": 0.7770874892608689, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 16003 + }, + { + "epoch": 0.16004, + "grad_norm": 0.7574820985938998, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 16004 + }, + { + "epoch": 0.16005, + "grad_norm": 0.7585189135995564, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 16005 + }, + { + "epoch": 0.16006, + "grad_norm": 0.6546507720652547, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 16006 + }, + { + "epoch": 0.16007, + "grad_norm": 0.7581247263003393, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 16007 + }, + { + "epoch": 0.16008, + "grad_norm": 0.8826284331558714, + "learning_rate": 0.003, + "loss": 4.087, + "step": 16008 + }, + { + "epoch": 0.16009, + "grad_norm": 1.240712531102964, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 16009 + }, + { + "epoch": 0.1601, + "grad_norm": 0.9538344035686259, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 16010 + }, + { + "epoch": 0.16011, + "grad_norm": 1.033857866287827, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 16011 + }, + { + "epoch": 0.16012, + "grad_norm": 0.9311632932211753, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 16012 + }, + { + "epoch": 0.16013, + "grad_norm": 1.0491791126642829, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 16013 + }, + { + "epoch": 0.16014, + "grad_norm": 0.9839521570660503, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 16014 + }, + { + "epoch": 0.16015, + "grad_norm": 0.8702868304503665, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 16015 + }, + { + "epoch": 0.16016, + "grad_norm": 0.7530861667542754, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 16016 + }, + { + "epoch": 0.16017, + "grad_norm": 0.7147199273034064, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 16017 + }, + { + "epoch": 0.16018, + "grad_norm": 0.6433067677650978, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 16018 + }, + { + "epoch": 0.16019, + "grad_norm": 0.7594542328789683, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 16019 + }, + { + "epoch": 0.1602, + "grad_norm": 0.8970739684818545, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 16020 + }, + { + "epoch": 0.16021, + "grad_norm": 0.8490208137519246, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 16021 + }, + { + "epoch": 0.16022, + "grad_norm": 0.9183771077368851, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 16022 + }, + { + "epoch": 0.16023, + "grad_norm": 1.0917793172404378, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 16023 + }, + { + "epoch": 0.16024, + "grad_norm": 0.9644313592043875, + "learning_rate": 0.003, + "loss": 4.079, + "step": 16024 + }, + { + "epoch": 0.16025, + "grad_norm": 0.9459752941651064, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 16025 + }, + { + "epoch": 0.16026, + "grad_norm": 1.025321156474423, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 16026 + }, + { + "epoch": 0.16027, + "grad_norm": 1.1053338433748303, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 16027 + }, + { + "epoch": 0.16028, + "grad_norm": 0.807135351221388, + "learning_rate": 0.003, + "loss": 4.081, + "step": 16028 + }, + { + "epoch": 0.16029, + "grad_norm": 0.6682553235156985, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 16029 + }, + { + "epoch": 0.1603, + "grad_norm": 0.5598976813828049, + "learning_rate": 0.003, + "loss": 4.053, + "step": 16030 + }, + { + "epoch": 0.16031, + "grad_norm": 0.6408595513391798, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 16031 + }, + { + "epoch": 0.16032, + "grad_norm": 0.6611918144827753, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 16032 + }, + { + "epoch": 0.16033, + "grad_norm": 0.7211422238082564, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 16033 + }, + { + "epoch": 0.16034, + "grad_norm": 0.7644557015482096, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 16034 + }, + { + "epoch": 0.16035, + "grad_norm": 0.8034077903207645, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 16035 + }, + { + "epoch": 0.16036, + "grad_norm": 0.8301832588548755, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 16036 + }, + { + "epoch": 0.16037, + "grad_norm": 0.8829561319755103, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 16037 + }, + { + "epoch": 0.16038, + "grad_norm": 0.9062856939981632, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 16038 + }, + { + "epoch": 0.16039, + "grad_norm": 1.0435050301640487, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 16039 + }, + { + "epoch": 0.1604, + "grad_norm": 1.161696773773414, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 16040 + }, + { + "epoch": 0.16041, + "grad_norm": 1.049776658985168, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 16041 + }, + { + "epoch": 0.16042, + "grad_norm": 1.0095540675816592, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 16042 + }, + { + "epoch": 0.16043, + "grad_norm": 1.0389961580110167, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 16043 + }, + { + "epoch": 0.16044, + "grad_norm": 1.1219833074583003, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 16044 + }, + { + "epoch": 0.16045, + "grad_norm": 0.9769731729552488, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 16045 + }, + { + "epoch": 0.16046, + "grad_norm": 0.9821796117147715, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 16046 + }, + { + "epoch": 0.16047, + "grad_norm": 0.9522902886577452, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 16047 + }, + { + "epoch": 0.16048, + "grad_norm": 0.9445776198863215, + "learning_rate": 0.003, + "loss": 4.069, + "step": 16048 + }, + { + "epoch": 0.16049, + "grad_norm": 0.9887852799914124, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 16049 + }, + { + "epoch": 0.1605, + "grad_norm": 0.9282871554540153, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 16050 + }, + { + "epoch": 0.16051, + "grad_norm": 0.8937769659977602, + "learning_rate": 0.003, + "loss": 4.073, + "step": 16051 + }, + { + "epoch": 0.16052, + "grad_norm": 0.8438392490479742, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 16052 + }, + { + "epoch": 0.16053, + "grad_norm": 0.8548799870414077, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 16053 + }, + { + "epoch": 0.16054, + "grad_norm": 0.8207204685878131, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 16054 + }, + { + "epoch": 0.16055, + "grad_norm": 0.9141771764995624, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 16055 + }, + { + "epoch": 0.16056, + "grad_norm": 0.9888082350106229, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 16056 + }, + { + "epoch": 0.16057, + "grad_norm": 1.0735077625299911, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 16057 + }, + { + "epoch": 0.16058, + "grad_norm": 1.0721273987403106, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 16058 + }, + { + "epoch": 0.16059, + "grad_norm": 1.1176266745551053, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 16059 + }, + { + "epoch": 0.1606, + "grad_norm": 0.969389072573312, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 16060 + }, + { + "epoch": 0.16061, + "grad_norm": 0.8224590607768597, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 16061 + }, + { + "epoch": 0.16062, + "grad_norm": 0.6720431701334851, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 16062 + }, + { + "epoch": 0.16063, + "grad_norm": 0.6818729632002152, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 16063 + }, + { + "epoch": 0.16064, + "grad_norm": 0.8985636731615916, + "learning_rate": 0.003, + "loss": 4.094, + "step": 16064 + }, + { + "epoch": 0.16065, + "grad_norm": 1.0579805261802582, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 16065 + }, + { + "epoch": 0.16066, + "grad_norm": 1.0907066243086723, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 16066 + }, + { + "epoch": 0.16067, + "grad_norm": 0.7791578863985551, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 16067 + }, + { + "epoch": 0.16068, + "grad_norm": 0.5591335499491482, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 16068 + }, + { + "epoch": 0.16069, + "grad_norm": 0.591376055821003, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 16069 + }, + { + "epoch": 0.1607, + "grad_norm": 0.6512621136286194, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 16070 + }, + { + "epoch": 0.16071, + "grad_norm": 0.6576654985720259, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 16071 + }, + { + "epoch": 0.16072, + "grad_norm": 0.674312163956186, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 16072 + }, + { + "epoch": 0.16073, + "grad_norm": 0.6518324043878526, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 16073 + }, + { + "epoch": 0.16074, + "grad_norm": 0.6953964500504463, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 16074 + }, + { + "epoch": 0.16075, + "grad_norm": 0.7318374110395267, + "learning_rate": 0.003, + "loss": 4.078, + "step": 16075 + }, + { + "epoch": 0.16076, + "grad_norm": 0.738169566379988, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 16076 + }, + { + "epoch": 0.16077, + "grad_norm": 0.6507978114501953, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 16077 + }, + { + "epoch": 0.16078, + "grad_norm": 0.6444506288956432, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 16078 + }, + { + "epoch": 0.16079, + "grad_norm": 0.7626020296482732, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 16079 + }, + { + "epoch": 0.1608, + "grad_norm": 0.9860731495646577, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 16080 + }, + { + "epoch": 0.16081, + "grad_norm": 1.0900955865738118, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 16081 + }, + { + "epoch": 0.16082, + "grad_norm": 1.053783767638632, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 16082 + }, + { + "epoch": 0.16083, + "grad_norm": 1.101311764095707, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 16083 + }, + { + "epoch": 0.16084, + "grad_norm": 1.225144577378712, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 16084 + }, + { + "epoch": 0.16085, + "grad_norm": 1.0988604664404609, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 16085 + }, + { + "epoch": 0.16086, + "grad_norm": 1.0143867562647895, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 16086 + }, + { + "epoch": 0.16087, + "grad_norm": 0.9728535251670195, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 16087 + }, + { + "epoch": 0.16088, + "grad_norm": 1.1536823978110107, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 16088 + }, + { + "epoch": 0.16089, + "grad_norm": 0.8434130155671425, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 16089 + }, + { + "epoch": 0.1609, + "grad_norm": 0.6284904218935911, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 16090 + }, + { + "epoch": 0.16091, + "grad_norm": 0.6057989632237925, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 16091 + }, + { + "epoch": 0.16092, + "grad_norm": 0.7277031337734261, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 16092 + }, + { + "epoch": 0.16093, + "grad_norm": 0.8613467424478359, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 16093 + }, + { + "epoch": 0.16094, + "grad_norm": 0.992589572106217, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 16094 + }, + { + "epoch": 0.16095, + "grad_norm": 1.0328894614323847, + "learning_rate": 0.003, + "loss": 4.047, + "step": 16095 + }, + { + "epoch": 0.16096, + "grad_norm": 1.0775067185238585, + "learning_rate": 0.003, + "loss": 4.075, + "step": 16096 + }, + { + "epoch": 0.16097, + "grad_norm": 0.9481167996653196, + "learning_rate": 0.003, + "loss": 4.045, + "step": 16097 + }, + { + "epoch": 0.16098, + "grad_norm": 0.8310343218773544, + "learning_rate": 0.003, + "loss": 4.075, + "step": 16098 + }, + { + "epoch": 0.16099, + "grad_norm": 0.7432804786692967, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 16099 + }, + { + "epoch": 0.161, + "grad_norm": 0.7651025363039288, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 16100 + }, + { + "epoch": 0.16101, + "grad_norm": 0.8010871738088194, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 16101 + }, + { + "epoch": 0.16102, + "grad_norm": 0.9549984349511024, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 16102 + }, + { + "epoch": 0.16103, + "grad_norm": 0.8802420693876036, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 16103 + }, + { + "epoch": 0.16104, + "grad_norm": 0.7570305416904907, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 16104 + }, + { + "epoch": 0.16105, + "grad_norm": 0.6899773517005993, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 16105 + }, + { + "epoch": 0.16106, + "grad_norm": 0.6658011982893003, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 16106 + }, + { + "epoch": 0.16107, + "grad_norm": 0.7506078945169913, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 16107 + }, + { + "epoch": 0.16108, + "grad_norm": 0.7424462464592597, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 16108 + }, + { + "epoch": 0.16109, + "grad_norm": 0.7785698489989537, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 16109 + }, + { + "epoch": 0.1611, + "grad_norm": 0.9224817582034468, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 16110 + }, + { + "epoch": 0.16111, + "grad_norm": 0.888232403600876, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 16111 + }, + { + "epoch": 0.16112, + "grad_norm": 0.8482124293727162, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 16112 + }, + { + "epoch": 0.16113, + "grad_norm": 0.7901927949141969, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 16113 + }, + { + "epoch": 0.16114, + "grad_norm": 0.9037691266050597, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 16114 + }, + { + "epoch": 0.16115, + "grad_norm": 0.9356281618274787, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 16115 + }, + { + "epoch": 0.16116, + "grad_norm": 0.8696043376245159, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 16116 + }, + { + "epoch": 0.16117, + "grad_norm": 0.954833084830157, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 16117 + }, + { + "epoch": 0.16118, + "grad_norm": 0.9819884195188994, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 16118 + }, + { + "epoch": 0.16119, + "grad_norm": 1.1803300259419853, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 16119 + }, + { + "epoch": 0.1612, + "grad_norm": 1.042483615683666, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 16120 + }, + { + "epoch": 0.16121, + "grad_norm": 1.0832236766831873, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 16121 + }, + { + "epoch": 0.16122, + "grad_norm": 1.1181535977453274, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 16122 + }, + { + "epoch": 0.16123, + "grad_norm": 0.852356323752036, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 16123 + }, + { + "epoch": 0.16124, + "grad_norm": 0.7660958063238035, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 16124 + }, + { + "epoch": 0.16125, + "grad_norm": 0.8917390529772087, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 16125 + }, + { + "epoch": 0.16126, + "grad_norm": 1.0485061599011214, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 16126 + }, + { + "epoch": 0.16127, + "grad_norm": 1.1550677872719373, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 16127 + }, + { + "epoch": 0.16128, + "grad_norm": 1.2033352556382457, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 16128 + }, + { + "epoch": 0.16129, + "grad_norm": 0.9479689526076787, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 16129 + }, + { + "epoch": 0.1613, + "grad_norm": 1.00085432316174, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 16130 + }, + { + "epoch": 0.16131, + "grad_norm": 1.0426525513597849, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 16131 + }, + { + "epoch": 0.16132, + "grad_norm": 0.9472206142456094, + "learning_rate": 0.003, + "loss": 4.08, + "step": 16132 + }, + { + "epoch": 0.16133, + "grad_norm": 0.8465359997349694, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 16133 + }, + { + "epoch": 0.16134, + "grad_norm": 0.8444626961435644, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 16134 + }, + { + "epoch": 0.16135, + "grad_norm": 0.7612020993107949, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 16135 + }, + { + "epoch": 0.16136, + "grad_norm": 0.8235281739406003, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 16136 + }, + { + "epoch": 0.16137, + "grad_norm": 0.8727770642878132, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 16137 + }, + { + "epoch": 0.16138, + "grad_norm": 1.0650874817500309, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 16138 + }, + { + "epoch": 0.16139, + "grad_norm": 0.997039583973425, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 16139 + }, + { + "epoch": 0.1614, + "grad_norm": 0.9335305364102751, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 16140 + }, + { + "epoch": 0.16141, + "grad_norm": 0.8764782582415178, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 16141 + }, + { + "epoch": 0.16142, + "grad_norm": 0.8602530505109808, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 16142 + }, + { + "epoch": 0.16143, + "grad_norm": 0.8516346837710956, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 16143 + }, + { + "epoch": 0.16144, + "grad_norm": 0.8040861469272487, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 16144 + }, + { + "epoch": 0.16145, + "grad_norm": 0.8534615396130611, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 16145 + }, + { + "epoch": 0.16146, + "grad_norm": 0.9310365006212695, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 16146 + }, + { + "epoch": 0.16147, + "grad_norm": 0.9047052432143592, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 16147 + }, + { + "epoch": 0.16148, + "grad_norm": 0.8300437671265596, + "learning_rate": 0.003, + "loss": 4.063, + "step": 16148 + }, + { + "epoch": 0.16149, + "grad_norm": 0.796095988427987, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 16149 + }, + { + "epoch": 0.1615, + "grad_norm": 0.6754321803911064, + "learning_rate": 0.003, + "loss": 4.081, + "step": 16150 + }, + { + "epoch": 0.16151, + "grad_norm": 0.5722295073166785, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 16151 + }, + { + "epoch": 0.16152, + "grad_norm": 0.6519936639812628, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 16152 + }, + { + "epoch": 0.16153, + "grad_norm": 0.8071980548334277, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 16153 + }, + { + "epoch": 0.16154, + "grad_norm": 0.9527683328836535, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 16154 + }, + { + "epoch": 0.16155, + "grad_norm": 1.0365543089232858, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 16155 + }, + { + "epoch": 0.16156, + "grad_norm": 0.866323296095971, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 16156 + }, + { + "epoch": 0.16157, + "grad_norm": 0.7802931145542943, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 16157 + }, + { + "epoch": 0.16158, + "grad_norm": 0.9022402072547404, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 16158 + }, + { + "epoch": 0.16159, + "grad_norm": 0.9185278921751916, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 16159 + }, + { + "epoch": 0.1616, + "grad_norm": 0.7779480960036026, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 16160 + }, + { + "epoch": 0.16161, + "grad_norm": 0.8540115917124532, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 16161 + }, + { + "epoch": 0.16162, + "grad_norm": 0.8710301392729666, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 16162 + }, + { + "epoch": 0.16163, + "grad_norm": 0.9185723429298399, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 16163 + }, + { + "epoch": 0.16164, + "grad_norm": 0.870149263886394, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 16164 + }, + { + "epoch": 0.16165, + "grad_norm": 0.9590652445806417, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 16165 + }, + { + "epoch": 0.16166, + "grad_norm": 0.9705187919682385, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 16166 + }, + { + "epoch": 0.16167, + "grad_norm": 0.9354814546160687, + "learning_rate": 0.003, + "loss": 4.074, + "step": 16167 + }, + { + "epoch": 0.16168, + "grad_norm": 1.0353083177703744, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 16168 + }, + { + "epoch": 0.16169, + "grad_norm": 1.139545834970642, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 16169 + }, + { + "epoch": 0.1617, + "grad_norm": 0.9189672899307466, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 16170 + }, + { + "epoch": 0.16171, + "grad_norm": 0.9529781619985794, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 16171 + }, + { + "epoch": 0.16172, + "grad_norm": 1.0523261081221165, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 16172 + }, + { + "epoch": 0.16173, + "grad_norm": 0.9449027969526601, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 16173 + }, + { + "epoch": 0.16174, + "grad_norm": 0.9196952296341383, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 16174 + }, + { + "epoch": 0.16175, + "grad_norm": 0.868460902537329, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 16175 + }, + { + "epoch": 0.16176, + "grad_norm": 0.7378373702584629, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 16176 + }, + { + "epoch": 0.16177, + "grad_norm": 0.7264787377825251, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 16177 + }, + { + "epoch": 0.16178, + "grad_norm": 0.754651947342741, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 16178 + }, + { + "epoch": 0.16179, + "grad_norm": 0.8981779971221018, + "learning_rate": 0.003, + "loss": 4.058, + "step": 16179 + }, + { + "epoch": 0.1618, + "grad_norm": 0.8890031517849919, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 16180 + }, + { + "epoch": 0.16181, + "grad_norm": 0.7993844784574181, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 16181 + }, + { + "epoch": 0.16182, + "grad_norm": 0.8383713944082801, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 16182 + }, + { + "epoch": 0.16183, + "grad_norm": 0.741935624659754, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 16183 + }, + { + "epoch": 0.16184, + "grad_norm": 0.9150947015891732, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 16184 + }, + { + "epoch": 0.16185, + "grad_norm": 1.14201213309905, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 16185 + }, + { + "epoch": 0.16186, + "grad_norm": 0.8169479642039301, + "learning_rate": 0.003, + "loss": 4.045, + "step": 16186 + }, + { + "epoch": 0.16187, + "grad_norm": 0.6175338474335003, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 16187 + }, + { + "epoch": 0.16188, + "grad_norm": 0.5067716276572606, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 16188 + }, + { + "epoch": 0.16189, + "grad_norm": 0.630339607808424, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 16189 + }, + { + "epoch": 0.1619, + "grad_norm": 0.7737387391209762, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 16190 + }, + { + "epoch": 0.16191, + "grad_norm": 0.9102922482934057, + "learning_rate": 0.003, + "loss": 4.065, + "step": 16191 + }, + { + "epoch": 0.16192, + "grad_norm": 1.0551965593201214, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 16192 + }, + { + "epoch": 0.16193, + "grad_norm": 0.8780697635294824, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 16193 + }, + { + "epoch": 0.16194, + "grad_norm": 0.7980214042007789, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 16194 + }, + { + "epoch": 0.16195, + "grad_norm": 0.8988465824796351, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 16195 + }, + { + "epoch": 0.16196, + "grad_norm": 0.8228969895381656, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 16196 + }, + { + "epoch": 0.16197, + "grad_norm": 0.8983174915373545, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 16197 + }, + { + "epoch": 0.16198, + "grad_norm": 1.1260559439354179, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 16198 + }, + { + "epoch": 0.16199, + "grad_norm": 0.9390686499553168, + "learning_rate": 0.003, + "loss": 4.064, + "step": 16199 + }, + { + "epoch": 0.162, + "grad_norm": 0.8152299144993529, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 16200 + }, + { + "epoch": 0.16201, + "grad_norm": 0.7360914535721991, + "learning_rate": 0.003, + "loss": 4.075, + "step": 16201 + }, + { + "epoch": 0.16202, + "grad_norm": 0.7622137232192114, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 16202 + }, + { + "epoch": 0.16203, + "grad_norm": 0.7413943410115538, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 16203 + }, + { + "epoch": 0.16204, + "grad_norm": 0.7977514074970313, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 16204 + }, + { + "epoch": 0.16205, + "grad_norm": 0.7892478107568978, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 16205 + }, + { + "epoch": 0.16206, + "grad_norm": 0.720041606861047, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 16206 + }, + { + "epoch": 0.16207, + "grad_norm": 0.8672112136682307, + "learning_rate": 0.003, + "loss": 4.075, + "step": 16207 + }, + { + "epoch": 0.16208, + "grad_norm": 1.0494020699201347, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 16208 + }, + { + "epoch": 0.16209, + "grad_norm": 1.1249674854929692, + "learning_rate": 0.003, + "loss": 4.079, + "step": 16209 + }, + { + "epoch": 0.1621, + "grad_norm": 1.024550523762066, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 16210 + }, + { + "epoch": 0.16211, + "grad_norm": 1.0137782212236954, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 16211 + }, + { + "epoch": 0.16212, + "grad_norm": 0.9786722594139657, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 16212 + }, + { + "epoch": 0.16213, + "grad_norm": 0.9969853986796966, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 16213 + }, + { + "epoch": 0.16214, + "grad_norm": 1.0754979535674585, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 16214 + }, + { + "epoch": 0.16215, + "grad_norm": 0.9568793780351604, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 16215 + }, + { + "epoch": 0.16216, + "grad_norm": 1.1455624548738463, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 16216 + }, + { + "epoch": 0.16217, + "grad_norm": 1.0490097896393278, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 16217 + }, + { + "epoch": 0.16218, + "grad_norm": 1.0202644948426063, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 16218 + }, + { + "epoch": 0.16219, + "grad_norm": 1.0276796887123196, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 16219 + }, + { + "epoch": 0.1622, + "grad_norm": 1.0593292616804686, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 16220 + }, + { + "epoch": 0.16221, + "grad_norm": 1.0020651740485054, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 16221 + }, + { + "epoch": 0.16222, + "grad_norm": 1.1298337838433747, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 16222 + }, + { + "epoch": 0.16223, + "grad_norm": 1.0897648396658126, + "learning_rate": 0.003, + "loss": 4.103, + "step": 16223 + }, + { + "epoch": 0.16224, + "grad_norm": 0.9519637682676307, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 16224 + }, + { + "epoch": 0.16225, + "grad_norm": 0.953671025969703, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 16225 + }, + { + "epoch": 0.16226, + "grad_norm": 0.9896970202695674, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 16226 + }, + { + "epoch": 0.16227, + "grad_norm": 1.0032716113127798, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 16227 + }, + { + "epoch": 0.16228, + "grad_norm": 0.9738652448770629, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 16228 + }, + { + "epoch": 0.16229, + "grad_norm": 0.7674792332968209, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 16229 + }, + { + "epoch": 0.1623, + "grad_norm": 0.7193398489790191, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 16230 + }, + { + "epoch": 0.16231, + "grad_norm": 0.7309786942515929, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 16231 + }, + { + "epoch": 0.16232, + "grad_norm": 0.7492285412052875, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 16232 + }, + { + "epoch": 0.16233, + "grad_norm": 0.8021556523508814, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 16233 + }, + { + "epoch": 0.16234, + "grad_norm": 1.0145900379495671, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 16234 + }, + { + "epoch": 0.16235, + "grad_norm": 1.1075616915131692, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 16235 + }, + { + "epoch": 0.16236, + "grad_norm": 0.8213397231729862, + "learning_rate": 0.003, + "loss": 4.066, + "step": 16236 + }, + { + "epoch": 0.16237, + "grad_norm": 0.6768504042704822, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 16237 + }, + { + "epoch": 0.16238, + "grad_norm": 0.722693548204655, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 16238 + }, + { + "epoch": 0.16239, + "grad_norm": 0.6765664675242864, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 16239 + }, + { + "epoch": 0.1624, + "grad_norm": 0.629076717424327, + "learning_rate": 0.003, + "loss": 4.059, + "step": 16240 + }, + { + "epoch": 0.16241, + "grad_norm": 0.6743156519209437, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 16241 + }, + { + "epoch": 0.16242, + "grad_norm": 0.8544287885854247, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 16242 + }, + { + "epoch": 0.16243, + "grad_norm": 1.104586476829831, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 16243 + }, + { + "epoch": 0.16244, + "grad_norm": 1.059421360909398, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 16244 + }, + { + "epoch": 0.16245, + "grad_norm": 0.8718112501399637, + "learning_rate": 0.003, + "loss": 4.051, + "step": 16245 + }, + { + "epoch": 0.16246, + "grad_norm": 0.7612164093055407, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 16246 + }, + { + "epoch": 0.16247, + "grad_norm": 0.7014308272271207, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 16247 + }, + { + "epoch": 0.16248, + "grad_norm": 0.804493152182718, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 16248 + }, + { + "epoch": 0.16249, + "grad_norm": 0.8753722381900808, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 16249 + }, + { + "epoch": 0.1625, + "grad_norm": 0.9105179582557639, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 16250 + }, + { + "epoch": 0.16251, + "grad_norm": 0.9568872467193253, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 16251 + }, + { + "epoch": 0.16252, + "grad_norm": 0.9229114089817099, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 16252 + }, + { + "epoch": 0.16253, + "grad_norm": 0.8563003071325918, + "learning_rate": 0.003, + "loss": 4.075, + "step": 16253 + }, + { + "epoch": 0.16254, + "grad_norm": 0.8947039093540049, + "learning_rate": 0.003, + "loss": 4.041, + "step": 16254 + }, + { + "epoch": 0.16255, + "grad_norm": 0.8562350552382546, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 16255 + }, + { + "epoch": 0.16256, + "grad_norm": 0.7873504536371866, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 16256 + }, + { + "epoch": 0.16257, + "grad_norm": 0.7461171081796859, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 16257 + }, + { + "epoch": 0.16258, + "grad_norm": 0.674715451156988, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 16258 + }, + { + "epoch": 0.16259, + "grad_norm": 0.7035220419556732, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 16259 + }, + { + "epoch": 0.1626, + "grad_norm": 0.7835199425538674, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 16260 + }, + { + "epoch": 0.16261, + "grad_norm": 0.9023162600106548, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 16261 + }, + { + "epoch": 0.16262, + "grad_norm": 0.9670145461010528, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 16262 + }, + { + "epoch": 0.16263, + "grad_norm": 0.9734628038161928, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 16263 + }, + { + "epoch": 0.16264, + "grad_norm": 0.9188835579076302, + "learning_rate": 0.003, + "loss": 4.075, + "step": 16264 + }, + { + "epoch": 0.16265, + "grad_norm": 0.8256702747886675, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 16265 + }, + { + "epoch": 0.16266, + "grad_norm": 0.7592208986754003, + "learning_rate": 0.003, + "loss": 4.068, + "step": 16266 + }, + { + "epoch": 0.16267, + "grad_norm": 0.8039117002212741, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 16267 + }, + { + "epoch": 0.16268, + "grad_norm": 0.8947686350506168, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 16268 + }, + { + "epoch": 0.16269, + "grad_norm": 0.8243573580472494, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 16269 + }, + { + "epoch": 0.1627, + "grad_norm": 0.6575010590174963, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 16270 + }, + { + "epoch": 0.16271, + "grad_norm": 0.7610139261035422, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 16271 + }, + { + "epoch": 0.16272, + "grad_norm": 0.8343058454375321, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 16272 + }, + { + "epoch": 0.16273, + "grad_norm": 0.8281436786429534, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 16273 + }, + { + "epoch": 0.16274, + "grad_norm": 0.7678649443323914, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 16274 + }, + { + "epoch": 0.16275, + "grad_norm": 0.7551691228694758, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 16275 + }, + { + "epoch": 0.16276, + "grad_norm": 0.8269508965204964, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 16276 + }, + { + "epoch": 0.16277, + "grad_norm": 0.8912629174403209, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 16277 + }, + { + "epoch": 0.16278, + "grad_norm": 0.9975366156148245, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 16278 + }, + { + "epoch": 0.16279, + "grad_norm": 1.1451233189077932, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 16279 + }, + { + "epoch": 0.1628, + "grad_norm": 1.0487951683788876, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 16280 + }, + { + "epoch": 0.16281, + "grad_norm": 1.0172643716337553, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 16281 + }, + { + "epoch": 0.16282, + "grad_norm": 1.0773619764096158, + "learning_rate": 0.003, + "loss": 4.053, + "step": 16282 + }, + { + "epoch": 0.16283, + "grad_norm": 0.8691246985842682, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 16283 + }, + { + "epoch": 0.16284, + "grad_norm": 0.968909349697707, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 16284 + }, + { + "epoch": 0.16285, + "grad_norm": 1.1473100685823328, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 16285 + }, + { + "epoch": 0.16286, + "grad_norm": 0.8849533566376779, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 16286 + }, + { + "epoch": 0.16287, + "grad_norm": 0.9774613453573878, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 16287 + }, + { + "epoch": 0.16288, + "grad_norm": 1.0977809366380906, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 16288 + }, + { + "epoch": 0.16289, + "grad_norm": 0.8561584769481401, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 16289 + }, + { + "epoch": 0.1629, + "grad_norm": 0.7848931043950328, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 16290 + }, + { + "epoch": 0.16291, + "grad_norm": 0.7802439977245792, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 16291 + }, + { + "epoch": 0.16292, + "grad_norm": 0.7651178311141256, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 16292 + }, + { + "epoch": 0.16293, + "grad_norm": 0.8685065968678868, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 16293 + }, + { + "epoch": 0.16294, + "grad_norm": 0.9237856864310275, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 16294 + }, + { + "epoch": 0.16295, + "grad_norm": 1.1278124372935312, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 16295 + }, + { + "epoch": 0.16296, + "grad_norm": 0.9891264765467419, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 16296 + }, + { + "epoch": 0.16297, + "grad_norm": 0.966613287257267, + "learning_rate": 0.003, + "loss": 4.072, + "step": 16297 + }, + { + "epoch": 0.16298, + "grad_norm": 0.710801390925547, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 16298 + }, + { + "epoch": 0.16299, + "grad_norm": 0.6152415750940189, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 16299 + }, + { + "epoch": 0.163, + "grad_norm": 0.7568654652124776, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 16300 + }, + { + "epoch": 0.16301, + "grad_norm": 0.9004162565321375, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 16301 + }, + { + "epoch": 0.16302, + "grad_norm": 0.9466994903295309, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 16302 + }, + { + "epoch": 0.16303, + "grad_norm": 0.9583379791654986, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 16303 + }, + { + "epoch": 0.16304, + "grad_norm": 0.8963929816095652, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 16304 + }, + { + "epoch": 0.16305, + "grad_norm": 0.853690250397017, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 16305 + }, + { + "epoch": 0.16306, + "grad_norm": 0.8474023517120366, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 16306 + }, + { + "epoch": 0.16307, + "grad_norm": 0.8141426006933553, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 16307 + }, + { + "epoch": 0.16308, + "grad_norm": 0.7883105319317605, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 16308 + }, + { + "epoch": 0.16309, + "grad_norm": 0.623205437017482, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 16309 + }, + { + "epoch": 0.1631, + "grad_norm": 0.6984950064714907, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 16310 + }, + { + "epoch": 0.16311, + "grad_norm": 0.7218842752055934, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 16311 + }, + { + "epoch": 0.16312, + "grad_norm": 0.7694544368850195, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 16312 + }, + { + "epoch": 0.16313, + "grad_norm": 0.8263370653136152, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 16313 + }, + { + "epoch": 0.16314, + "grad_norm": 0.8317574056320102, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 16314 + }, + { + "epoch": 0.16315, + "grad_norm": 0.7535022241153165, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 16315 + }, + { + "epoch": 0.16316, + "grad_norm": 0.8417341764302408, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 16316 + }, + { + "epoch": 0.16317, + "grad_norm": 0.9901599491024954, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 16317 + }, + { + "epoch": 0.16318, + "grad_norm": 1.0004399101085264, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 16318 + }, + { + "epoch": 0.16319, + "grad_norm": 1.0692991592092003, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 16319 + }, + { + "epoch": 0.1632, + "grad_norm": 0.9561987828264448, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 16320 + }, + { + "epoch": 0.16321, + "grad_norm": 1.0251712944716842, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 16321 + }, + { + "epoch": 0.16322, + "grad_norm": 0.9816010445769573, + "learning_rate": 0.003, + "loss": 4.077, + "step": 16322 + }, + { + "epoch": 0.16323, + "grad_norm": 1.101057119290071, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 16323 + }, + { + "epoch": 0.16324, + "grad_norm": 1.0524633263040601, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 16324 + }, + { + "epoch": 0.16325, + "grad_norm": 0.8864279589063383, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 16325 + }, + { + "epoch": 0.16326, + "grad_norm": 0.8902968713402165, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 16326 + }, + { + "epoch": 0.16327, + "grad_norm": 1.0234400798281675, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 16327 + }, + { + "epoch": 0.16328, + "grad_norm": 0.9621135258676727, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 16328 + }, + { + "epoch": 0.16329, + "grad_norm": 1.0055178417417585, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 16329 + }, + { + "epoch": 0.1633, + "grad_norm": 1.0265496718395943, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 16330 + }, + { + "epoch": 0.16331, + "grad_norm": 0.7396114122702113, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 16331 + }, + { + "epoch": 0.16332, + "grad_norm": 0.6812936739431915, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 16332 + }, + { + "epoch": 0.16333, + "grad_norm": 0.6706419805801713, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 16333 + }, + { + "epoch": 0.16334, + "grad_norm": 0.7073675785764628, + "learning_rate": 0.003, + "loss": 4.082, + "step": 16334 + }, + { + "epoch": 0.16335, + "grad_norm": 0.7459792886555676, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 16335 + }, + { + "epoch": 0.16336, + "grad_norm": 0.9629007161107034, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 16336 + }, + { + "epoch": 0.16337, + "grad_norm": 1.0902010043538788, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 16337 + }, + { + "epoch": 0.16338, + "grad_norm": 0.8025447207521035, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 16338 + }, + { + "epoch": 0.16339, + "grad_norm": 0.7361805815115818, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 16339 + }, + { + "epoch": 0.1634, + "grad_norm": 0.6540008185315194, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 16340 + }, + { + "epoch": 0.16341, + "grad_norm": 0.688424768221228, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 16341 + }, + { + "epoch": 0.16342, + "grad_norm": 0.797647322547362, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 16342 + }, + { + "epoch": 0.16343, + "grad_norm": 0.8786609196121857, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 16343 + }, + { + "epoch": 0.16344, + "grad_norm": 0.9689555421782381, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 16344 + }, + { + "epoch": 0.16345, + "grad_norm": 1.010957931909869, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 16345 + }, + { + "epoch": 0.16346, + "grad_norm": 1.1043381808748678, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 16346 + }, + { + "epoch": 0.16347, + "grad_norm": 0.8627539680852754, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 16347 + }, + { + "epoch": 0.16348, + "grad_norm": 0.9415344862017608, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 16348 + }, + { + "epoch": 0.16349, + "grad_norm": 0.9722106862140105, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 16349 + }, + { + "epoch": 0.1635, + "grad_norm": 0.9225391759167083, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 16350 + }, + { + "epoch": 0.16351, + "grad_norm": 0.9050708322400156, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 16351 + }, + { + "epoch": 0.16352, + "grad_norm": 0.9648755421215274, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 16352 + }, + { + "epoch": 0.16353, + "grad_norm": 0.94978974355883, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 16353 + }, + { + "epoch": 0.16354, + "grad_norm": 0.8414682214556534, + "learning_rate": 0.003, + "loss": 4.097, + "step": 16354 + }, + { + "epoch": 0.16355, + "grad_norm": 0.8776397789694375, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 16355 + }, + { + "epoch": 0.16356, + "grad_norm": 0.9422987811838714, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 16356 + }, + { + "epoch": 0.16357, + "grad_norm": 0.8873496308807577, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 16357 + }, + { + "epoch": 0.16358, + "grad_norm": 0.9517019081662924, + "learning_rate": 0.003, + "loss": 4.093, + "step": 16358 + }, + { + "epoch": 0.16359, + "grad_norm": 0.8816314640473634, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 16359 + }, + { + "epoch": 0.1636, + "grad_norm": 0.9012518179501146, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 16360 + }, + { + "epoch": 0.16361, + "grad_norm": 1.072329376855215, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 16361 + }, + { + "epoch": 0.16362, + "grad_norm": 1.1132639228417447, + "learning_rate": 0.003, + "loss": 4.101, + "step": 16362 + }, + { + "epoch": 0.16363, + "grad_norm": 0.9357152587625169, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 16363 + }, + { + "epoch": 0.16364, + "grad_norm": 0.86584581619107, + "learning_rate": 0.003, + "loss": 4.089, + "step": 16364 + }, + { + "epoch": 0.16365, + "grad_norm": 0.8856370055503388, + "learning_rate": 0.003, + "loss": 4.082, + "step": 16365 + }, + { + "epoch": 0.16366, + "grad_norm": 0.9123103930623118, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 16366 + }, + { + "epoch": 0.16367, + "grad_norm": 0.8428889183502868, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 16367 + }, + { + "epoch": 0.16368, + "grad_norm": 0.9180145944409419, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 16368 + }, + { + "epoch": 0.16369, + "grad_norm": 0.829519571789251, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 16369 + }, + { + "epoch": 0.1637, + "grad_norm": 0.7900204959878786, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 16370 + }, + { + "epoch": 0.16371, + "grad_norm": 0.6797779972401826, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 16371 + }, + { + "epoch": 0.16372, + "grad_norm": 0.7262520028616694, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 16372 + }, + { + "epoch": 0.16373, + "grad_norm": 0.6769507847507134, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 16373 + }, + { + "epoch": 0.16374, + "grad_norm": 0.6338324807541841, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 16374 + }, + { + "epoch": 0.16375, + "grad_norm": 0.5978557858805862, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 16375 + }, + { + "epoch": 0.16376, + "grad_norm": 0.6375385979274388, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 16376 + }, + { + "epoch": 0.16377, + "grad_norm": 0.7618874770471668, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 16377 + }, + { + "epoch": 0.16378, + "grad_norm": 0.960624593465279, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 16378 + }, + { + "epoch": 0.16379, + "grad_norm": 1.307498950894558, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 16379 + }, + { + "epoch": 0.1638, + "grad_norm": 0.6842670158933465, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 16380 + }, + { + "epoch": 0.16381, + "grad_norm": 0.6605379375265492, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 16381 + }, + { + "epoch": 0.16382, + "grad_norm": 0.7131883422986482, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 16382 + }, + { + "epoch": 0.16383, + "grad_norm": 0.7352761940986067, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 16383 + }, + { + "epoch": 0.16384, + "grad_norm": 0.8802239132267842, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 16384 + }, + { + "epoch": 0.16385, + "grad_norm": 1.2980147396968764, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 16385 + }, + { + "epoch": 0.16386, + "grad_norm": 0.8717943810458164, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 16386 + }, + { + "epoch": 0.16387, + "grad_norm": 0.9473289171900946, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 16387 + }, + { + "epoch": 0.16388, + "grad_norm": 1.1518247231894105, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 16388 + }, + { + "epoch": 0.16389, + "grad_norm": 0.9102925832056002, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 16389 + }, + { + "epoch": 0.1639, + "grad_norm": 0.865928627298715, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 16390 + }, + { + "epoch": 0.16391, + "grad_norm": 0.8432577037598782, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 16391 + }, + { + "epoch": 0.16392, + "grad_norm": 0.8210471714324126, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 16392 + }, + { + "epoch": 0.16393, + "grad_norm": 0.7105093895533194, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 16393 + }, + { + "epoch": 0.16394, + "grad_norm": 0.6653408636561565, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 16394 + }, + { + "epoch": 0.16395, + "grad_norm": 0.642852993617922, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 16395 + }, + { + "epoch": 0.16396, + "grad_norm": 0.6657922469312689, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 16396 + }, + { + "epoch": 0.16397, + "grad_norm": 0.734892109340266, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 16397 + }, + { + "epoch": 0.16398, + "grad_norm": 0.9900340614929757, + "learning_rate": 0.003, + "loss": 4.064, + "step": 16398 + }, + { + "epoch": 0.16399, + "grad_norm": 1.281382027096473, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 16399 + }, + { + "epoch": 0.164, + "grad_norm": 0.7489538583502504, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 16400 + }, + { + "epoch": 0.16401, + "grad_norm": 0.819926182966586, + "learning_rate": 0.003, + "loss": 4.05, + "step": 16401 + }, + { + "epoch": 0.16402, + "grad_norm": 0.7900644421362657, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 16402 + }, + { + "epoch": 0.16403, + "grad_norm": 0.8795568912895309, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 16403 + }, + { + "epoch": 0.16404, + "grad_norm": 0.9187639580508348, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 16404 + }, + { + "epoch": 0.16405, + "grad_norm": 1.0592065825007824, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 16405 + }, + { + "epoch": 0.16406, + "grad_norm": 1.0370700965993263, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 16406 + }, + { + "epoch": 0.16407, + "grad_norm": 0.8660839214754755, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 16407 + }, + { + "epoch": 0.16408, + "grad_norm": 0.7866820033002769, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 16408 + }, + { + "epoch": 0.16409, + "grad_norm": 0.9302742716268162, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 16409 + }, + { + "epoch": 0.1641, + "grad_norm": 1.10096361603536, + "learning_rate": 0.003, + "loss": 4.075, + "step": 16410 + }, + { + "epoch": 0.16411, + "grad_norm": 0.8598337818949621, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 16411 + }, + { + "epoch": 0.16412, + "grad_norm": 0.8344572187111701, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 16412 + }, + { + "epoch": 0.16413, + "grad_norm": 0.935291715553534, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 16413 + }, + { + "epoch": 0.16414, + "grad_norm": 0.9773771759755003, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 16414 + }, + { + "epoch": 0.16415, + "grad_norm": 1.0974982570234062, + "learning_rate": 0.003, + "loss": 4.062, + "step": 16415 + }, + { + "epoch": 0.16416, + "grad_norm": 0.9382359630702094, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 16416 + }, + { + "epoch": 0.16417, + "grad_norm": 0.8753993968104263, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 16417 + }, + { + "epoch": 0.16418, + "grad_norm": 0.8890932886780114, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 16418 + }, + { + "epoch": 0.16419, + "grad_norm": 0.9773397196278301, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 16419 + }, + { + "epoch": 0.1642, + "grad_norm": 0.9569700811156342, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 16420 + }, + { + "epoch": 0.16421, + "grad_norm": 0.9914231867676052, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 16421 + }, + { + "epoch": 0.16422, + "grad_norm": 1.061614469418064, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 16422 + }, + { + "epoch": 0.16423, + "grad_norm": 0.9802931770326155, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 16423 + }, + { + "epoch": 0.16424, + "grad_norm": 1.0521374393850662, + "learning_rate": 0.003, + "loss": 4.079, + "step": 16424 + }, + { + "epoch": 0.16425, + "grad_norm": 0.8627856733774925, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 16425 + }, + { + "epoch": 0.16426, + "grad_norm": 0.7240879568833387, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 16426 + }, + { + "epoch": 0.16427, + "grad_norm": 0.6263409993198499, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 16427 + }, + { + "epoch": 0.16428, + "grad_norm": 0.5612375917314804, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 16428 + }, + { + "epoch": 0.16429, + "grad_norm": 0.5676625695669967, + "learning_rate": 0.003, + "loss": 4.063, + "step": 16429 + }, + { + "epoch": 0.1643, + "grad_norm": 0.635340784985962, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 16430 + }, + { + "epoch": 0.16431, + "grad_norm": 0.7579785372543869, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 16431 + }, + { + "epoch": 0.16432, + "grad_norm": 0.854104037492739, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 16432 + }, + { + "epoch": 0.16433, + "grad_norm": 0.9854975460081391, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 16433 + }, + { + "epoch": 0.16434, + "grad_norm": 1.054683157363344, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 16434 + }, + { + "epoch": 0.16435, + "grad_norm": 0.9408684826274759, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 16435 + }, + { + "epoch": 0.16436, + "grad_norm": 0.9383415831827594, + "learning_rate": 0.003, + "loss": 4.067, + "step": 16436 + }, + { + "epoch": 0.16437, + "grad_norm": 0.8808857016465607, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 16437 + }, + { + "epoch": 0.16438, + "grad_norm": 0.7824759991232686, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 16438 + }, + { + "epoch": 0.16439, + "grad_norm": 0.7433967595371673, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 16439 + }, + { + "epoch": 0.1644, + "grad_norm": 0.8026972361328236, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 16440 + }, + { + "epoch": 0.16441, + "grad_norm": 0.8636161760661323, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 16441 + }, + { + "epoch": 0.16442, + "grad_norm": 0.9149443557754856, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 16442 + }, + { + "epoch": 0.16443, + "grad_norm": 0.9370834323878338, + "learning_rate": 0.003, + "loss": 4.098, + "step": 16443 + }, + { + "epoch": 0.16444, + "grad_norm": 0.9478938788650303, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 16444 + }, + { + "epoch": 0.16445, + "grad_norm": 0.92959502518127, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 16445 + }, + { + "epoch": 0.16446, + "grad_norm": 0.8715990239886671, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 16446 + }, + { + "epoch": 0.16447, + "grad_norm": 0.9400935955138296, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 16447 + }, + { + "epoch": 0.16448, + "grad_norm": 0.9811240436203806, + "learning_rate": 0.003, + "loss": 4.092, + "step": 16448 + }, + { + "epoch": 0.16449, + "grad_norm": 0.8918189523580856, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 16449 + }, + { + "epoch": 0.1645, + "grad_norm": 0.7194053286802724, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 16450 + }, + { + "epoch": 0.16451, + "grad_norm": 0.8774922168689883, + "learning_rate": 0.003, + "loss": 4.076, + "step": 16451 + }, + { + "epoch": 0.16452, + "grad_norm": 1.0458245173342429, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 16452 + }, + { + "epoch": 0.16453, + "grad_norm": 1.0098898686793907, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 16453 + }, + { + "epoch": 0.16454, + "grad_norm": 1.048560219790088, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 16454 + }, + { + "epoch": 0.16455, + "grad_norm": 0.92618933646125, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 16455 + }, + { + "epoch": 0.16456, + "grad_norm": 0.8927600222337699, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 16456 + }, + { + "epoch": 0.16457, + "grad_norm": 0.8178539590392163, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 16457 + }, + { + "epoch": 0.16458, + "grad_norm": 0.7201786957361529, + "learning_rate": 0.003, + "loss": 4.055, + "step": 16458 + }, + { + "epoch": 0.16459, + "grad_norm": 0.8265961160008374, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 16459 + }, + { + "epoch": 0.1646, + "grad_norm": 0.8666697955312822, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 16460 + }, + { + "epoch": 0.16461, + "grad_norm": 0.87034742661441, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 16461 + }, + { + "epoch": 0.16462, + "grad_norm": 1.1384608015954887, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 16462 + }, + { + "epoch": 0.16463, + "grad_norm": 0.9153887291272492, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 16463 + }, + { + "epoch": 0.16464, + "grad_norm": 0.8115386706398814, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 16464 + }, + { + "epoch": 0.16465, + "grad_norm": 0.8466232506914497, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 16465 + }, + { + "epoch": 0.16466, + "grad_norm": 0.861983793500373, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 16466 + }, + { + "epoch": 0.16467, + "grad_norm": 0.8624133882028252, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 16467 + }, + { + "epoch": 0.16468, + "grad_norm": 0.9831016147187888, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 16468 + }, + { + "epoch": 0.16469, + "grad_norm": 1.086376722450923, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 16469 + }, + { + "epoch": 0.1647, + "grad_norm": 0.9782884286370237, + "learning_rate": 0.003, + "loss": 4.068, + "step": 16470 + }, + { + "epoch": 0.16471, + "grad_norm": 1.0671488756490388, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 16471 + }, + { + "epoch": 0.16472, + "grad_norm": 0.9754565797535374, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 16472 + }, + { + "epoch": 0.16473, + "grad_norm": 0.9847542732824185, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 16473 + }, + { + "epoch": 0.16474, + "grad_norm": 0.9742436486561515, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 16474 + }, + { + "epoch": 0.16475, + "grad_norm": 0.9689392722162178, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 16475 + }, + { + "epoch": 0.16476, + "grad_norm": 1.095996390726339, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 16476 + }, + { + "epoch": 0.16477, + "grad_norm": 0.9124930058545174, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 16477 + }, + { + "epoch": 0.16478, + "grad_norm": 0.9138302018141485, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 16478 + }, + { + "epoch": 0.16479, + "grad_norm": 0.8989537665410646, + "learning_rate": 0.003, + "loss": 4.08, + "step": 16479 + }, + { + "epoch": 0.1648, + "grad_norm": 0.9419921335711516, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 16480 + }, + { + "epoch": 0.16481, + "grad_norm": 0.9818098036597075, + "learning_rate": 0.003, + "loss": 4.08, + "step": 16481 + }, + { + "epoch": 0.16482, + "grad_norm": 0.9857383085844478, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 16482 + }, + { + "epoch": 0.16483, + "grad_norm": 0.9670746894077116, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 16483 + }, + { + "epoch": 0.16484, + "grad_norm": 1.1323020032533249, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 16484 + }, + { + "epoch": 0.16485, + "grad_norm": 0.9834520956577053, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 16485 + }, + { + "epoch": 0.16486, + "grad_norm": 0.8026751155382573, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 16486 + }, + { + "epoch": 0.16487, + "grad_norm": 0.6228904546696008, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 16487 + }, + { + "epoch": 0.16488, + "grad_norm": 0.6050614777062014, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 16488 + }, + { + "epoch": 0.16489, + "grad_norm": 0.6695305780666493, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 16489 + }, + { + "epoch": 0.1649, + "grad_norm": 0.7154645577664862, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 16490 + }, + { + "epoch": 0.16491, + "grad_norm": 0.7377130840357033, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 16491 + }, + { + "epoch": 0.16492, + "grad_norm": 0.7203077541945648, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 16492 + }, + { + "epoch": 0.16493, + "grad_norm": 0.7365393404657381, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 16493 + }, + { + "epoch": 0.16494, + "grad_norm": 0.8644012481872585, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 16494 + }, + { + "epoch": 0.16495, + "grad_norm": 1.0341330029439595, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 16495 + }, + { + "epoch": 0.16496, + "grad_norm": 1.0165424018268916, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 16496 + }, + { + "epoch": 0.16497, + "grad_norm": 0.8879322560066397, + "learning_rate": 0.003, + "loss": 4.025, + "step": 16497 + }, + { + "epoch": 0.16498, + "grad_norm": 0.7631123782889997, + "learning_rate": 0.003, + "loss": 4.014, + "step": 16498 + }, + { + "epoch": 0.16499, + "grad_norm": 0.5842517604191956, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 16499 + }, + { + "epoch": 0.165, + "grad_norm": 0.5699230875663656, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 16500 + }, + { + "epoch": 0.16501, + "grad_norm": 0.5382001439361487, + "learning_rate": 0.003, + "loss": 4.069, + "step": 16501 + }, + { + "epoch": 0.16502, + "grad_norm": 0.5982219845821781, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 16502 + }, + { + "epoch": 0.16503, + "grad_norm": 0.6010589571428604, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 16503 + }, + { + "epoch": 0.16504, + "grad_norm": 0.5939698677680427, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 16504 + }, + { + "epoch": 0.16505, + "grad_norm": 0.6108774309732249, + "learning_rate": 0.003, + "loss": 4.068, + "step": 16505 + }, + { + "epoch": 0.16506, + "grad_norm": 0.6375858425394414, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 16506 + }, + { + "epoch": 0.16507, + "grad_norm": 0.6959426724754599, + "learning_rate": 0.003, + "loss": 4.061, + "step": 16507 + }, + { + "epoch": 0.16508, + "grad_norm": 0.8847577219107231, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 16508 + }, + { + "epoch": 0.16509, + "grad_norm": 1.190871093756928, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 16509 + }, + { + "epoch": 0.1651, + "grad_norm": 0.9961046853051119, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 16510 + }, + { + "epoch": 0.16511, + "grad_norm": 1.0788096121797417, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 16511 + }, + { + "epoch": 0.16512, + "grad_norm": 0.8699285198446157, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 16512 + }, + { + "epoch": 0.16513, + "grad_norm": 0.7673644941699407, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 16513 + }, + { + "epoch": 0.16514, + "grad_norm": 0.6915244832844373, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 16514 + }, + { + "epoch": 0.16515, + "grad_norm": 0.6743873426586414, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 16515 + }, + { + "epoch": 0.16516, + "grad_norm": 0.7454244375026999, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 16516 + }, + { + "epoch": 0.16517, + "grad_norm": 0.9299992427701205, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 16517 + }, + { + "epoch": 0.16518, + "grad_norm": 1.0993407224898688, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 16518 + }, + { + "epoch": 0.16519, + "grad_norm": 0.9677895313461976, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 16519 + }, + { + "epoch": 0.1652, + "grad_norm": 0.9507837271180546, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 16520 + }, + { + "epoch": 0.16521, + "grad_norm": 0.7637014412033272, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 16521 + }, + { + "epoch": 0.16522, + "grad_norm": 0.7530902468779367, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 16522 + }, + { + "epoch": 0.16523, + "grad_norm": 0.7588941082123793, + "learning_rate": 0.003, + "loss": 4.076, + "step": 16523 + }, + { + "epoch": 0.16524, + "grad_norm": 0.7409911017180301, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 16524 + }, + { + "epoch": 0.16525, + "grad_norm": 0.7187504248398197, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 16525 + }, + { + "epoch": 0.16526, + "grad_norm": 0.6484130371471625, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 16526 + }, + { + "epoch": 0.16527, + "grad_norm": 0.7458645932079909, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 16527 + }, + { + "epoch": 0.16528, + "grad_norm": 0.926064666970427, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 16528 + }, + { + "epoch": 0.16529, + "grad_norm": 1.2064446265719213, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 16529 + }, + { + "epoch": 0.1653, + "grad_norm": 0.9259164416529385, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 16530 + }, + { + "epoch": 0.16531, + "grad_norm": 0.8681080010290746, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 16531 + }, + { + "epoch": 0.16532, + "grad_norm": 0.8127777297072766, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 16532 + }, + { + "epoch": 0.16533, + "grad_norm": 0.8678429048267168, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 16533 + }, + { + "epoch": 0.16534, + "grad_norm": 0.8985386617651252, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 16534 + }, + { + "epoch": 0.16535, + "grad_norm": 1.0167292675159916, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 16535 + }, + { + "epoch": 0.16536, + "grad_norm": 1.0698354837164155, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 16536 + }, + { + "epoch": 0.16537, + "grad_norm": 0.9633811963508263, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 16537 + }, + { + "epoch": 0.16538, + "grad_norm": 0.9497531805329531, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 16538 + }, + { + "epoch": 0.16539, + "grad_norm": 1.0383203381577468, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 16539 + }, + { + "epoch": 0.1654, + "grad_norm": 1.0997067760078798, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 16540 + }, + { + "epoch": 0.16541, + "grad_norm": 1.0736621453658377, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 16541 + }, + { + "epoch": 0.16542, + "grad_norm": 0.9040243208029298, + "learning_rate": 0.003, + "loss": 4.068, + "step": 16542 + }, + { + "epoch": 0.16543, + "grad_norm": 0.8621822194244528, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 16543 + }, + { + "epoch": 0.16544, + "grad_norm": 0.9084383717664533, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 16544 + }, + { + "epoch": 0.16545, + "grad_norm": 0.9375564496584654, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 16545 + }, + { + "epoch": 0.16546, + "grad_norm": 0.9801135091236702, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 16546 + }, + { + "epoch": 0.16547, + "grad_norm": 1.0640556603633693, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 16547 + }, + { + "epoch": 0.16548, + "grad_norm": 1.1697631645113367, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 16548 + }, + { + "epoch": 0.16549, + "grad_norm": 1.040338745911738, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 16549 + }, + { + "epoch": 0.1655, + "grad_norm": 1.152265149590593, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 16550 + }, + { + "epoch": 0.16551, + "grad_norm": 0.9438970192293612, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 16551 + }, + { + "epoch": 0.16552, + "grad_norm": 0.9472087474276947, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 16552 + }, + { + "epoch": 0.16553, + "grad_norm": 0.8642983983001351, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 16553 + }, + { + "epoch": 0.16554, + "grad_norm": 0.7846420637771692, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 16554 + }, + { + "epoch": 0.16555, + "grad_norm": 0.8344787520798734, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 16555 + }, + { + "epoch": 0.16556, + "grad_norm": 0.9895997187897928, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 16556 + }, + { + "epoch": 0.16557, + "grad_norm": 1.2059192307357727, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 16557 + }, + { + "epoch": 0.16558, + "grad_norm": 0.8108602019844887, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 16558 + }, + { + "epoch": 0.16559, + "grad_norm": 0.8079073457719984, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 16559 + }, + { + "epoch": 0.1656, + "grad_norm": 0.9325977252728038, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 16560 + }, + { + "epoch": 0.16561, + "grad_norm": 1.0383802288301007, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 16561 + }, + { + "epoch": 0.16562, + "grad_norm": 1.0914868529759103, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 16562 + }, + { + "epoch": 0.16563, + "grad_norm": 0.8461510347991466, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 16563 + }, + { + "epoch": 0.16564, + "grad_norm": 0.7545415344715569, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 16564 + }, + { + "epoch": 0.16565, + "grad_norm": 0.6071515416738734, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 16565 + }, + { + "epoch": 0.16566, + "grad_norm": 0.6269742992012324, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 16566 + }, + { + "epoch": 0.16567, + "grad_norm": 0.7214176988230578, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 16567 + }, + { + "epoch": 0.16568, + "grad_norm": 0.793749521150152, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 16568 + }, + { + "epoch": 0.16569, + "grad_norm": 0.8544473789887507, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 16569 + }, + { + "epoch": 0.1657, + "grad_norm": 0.8478055298450039, + "learning_rate": 0.003, + "loss": 4.058, + "step": 16570 + }, + { + "epoch": 0.16571, + "grad_norm": 0.8715598001196075, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 16571 + }, + { + "epoch": 0.16572, + "grad_norm": 1.035496007750827, + "learning_rate": 0.003, + "loss": 4.08, + "step": 16572 + }, + { + "epoch": 0.16573, + "grad_norm": 1.1043299237619633, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 16573 + }, + { + "epoch": 0.16574, + "grad_norm": 0.8472709139237138, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 16574 + }, + { + "epoch": 0.16575, + "grad_norm": 0.8707354926263987, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 16575 + }, + { + "epoch": 0.16576, + "grad_norm": 0.9025740564326497, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 16576 + }, + { + "epoch": 0.16577, + "grad_norm": 0.9134741559472201, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 16577 + }, + { + "epoch": 0.16578, + "grad_norm": 0.9904958243395321, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 16578 + }, + { + "epoch": 0.16579, + "grad_norm": 1.0378807489332245, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 16579 + }, + { + "epoch": 0.1658, + "grad_norm": 1.0165662059122322, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 16580 + }, + { + "epoch": 0.16581, + "grad_norm": 1.1727087192109356, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 16581 + }, + { + "epoch": 0.16582, + "grad_norm": 0.9098497869999458, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 16582 + }, + { + "epoch": 0.16583, + "grad_norm": 1.1607073860592807, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 16583 + }, + { + "epoch": 0.16584, + "grad_norm": 0.98421831903711, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 16584 + }, + { + "epoch": 0.16585, + "grad_norm": 1.0352122742152954, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 16585 + }, + { + "epoch": 0.16586, + "grad_norm": 1.0343020941022698, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 16586 + }, + { + "epoch": 0.16587, + "grad_norm": 0.9208642439125199, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 16587 + }, + { + "epoch": 0.16588, + "grad_norm": 0.988283427584848, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 16588 + }, + { + "epoch": 0.16589, + "grad_norm": 1.0723578577185815, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 16589 + }, + { + "epoch": 0.1659, + "grad_norm": 1.0865388838071601, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 16590 + }, + { + "epoch": 0.16591, + "grad_norm": 0.7835732754712912, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 16591 + }, + { + "epoch": 0.16592, + "grad_norm": 0.7080299673094684, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 16592 + }, + { + "epoch": 0.16593, + "grad_norm": 0.7952753455467056, + "learning_rate": 0.003, + "loss": 4.07, + "step": 16593 + }, + { + "epoch": 0.16594, + "grad_norm": 0.8025023491675204, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 16594 + }, + { + "epoch": 0.16595, + "grad_norm": 0.8434752063860484, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 16595 + }, + { + "epoch": 0.16596, + "grad_norm": 0.7576039699436765, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 16596 + }, + { + "epoch": 0.16597, + "grad_norm": 0.8040176359188829, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 16597 + }, + { + "epoch": 0.16598, + "grad_norm": 0.8108078582877047, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 16598 + }, + { + "epoch": 0.16599, + "grad_norm": 0.9098138069546619, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 16599 + }, + { + "epoch": 0.166, + "grad_norm": 1.1119201512737737, + "learning_rate": 0.003, + "loss": 4.035, + "step": 16600 + }, + { + "epoch": 0.16601, + "grad_norm": 1.1905088097032037, + "learning_rate": 0.003, + "loss": 4.1055, + "step": 16601 + }, + { + "epoch": 0.16602, + "grad_norm": 0.8303069600930817, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 16602 + }, + { + "epoch": 0.16603, + "grad_norm": 0.7689642488159598, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 16603 + }, + { + "epoch": 0.16604, + "grad_norm": 0.8280858237676337, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 16604 + }, + { + "epoch": 0.16605, + "grad_norm": 0.8886330843711158, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 16605 + }, + { + "epoch": 0.16606, + "grad_norm": 1.0974928983661676, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 16606 + }, + { + "epoch": 0.16607, + "grad_norm": 0.9574499041192838, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 16607 + }, + { + "epoch": 0.16608, + "grad_norm": 0.9010072595434921, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 16608 + }, + { + "epoch": 0.16609, + "grad_norm": 0.9643310481457389, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 16609 + }, + { + "epoch": 0.1661, + "grad_norm": 1.0793575735585432, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 16610 + }, + { + "epoch": 0.16611, + "grad_norm": 1.1066751960261507, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 16611 + }, + { + "epoch": 0.16612, + "grad_norm": 0.8653315805591723, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 16612 + }, + { + "epoch": 0.16613, + "grad_norm": 0.7261582929574493, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 16613 + }, + { + "epoch": 0.16614, + "grad_norm": 0.6829087825872896, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 16614 + }, + { + "epoch": 0.16615, + "grad_norm": 0.7054220761382889, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 16615 + }, + { + "epoch": 0.16616, + "grad_norm": 0.7378934716762852, + "learning_rate": 0.003, + "loss": 4.1173, + "step": 16616 + }, + { + "epoch": 0.16617, + "grad_norm": 0.785770852990258, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 16617 + }, + { + "epoch": 0.16618, + "grad_norm": 0.8678169089350518, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 16618 + }, + { + "epoch": 0.16619, + "grad_norm": 1.0215330215473912, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 16619 + }, + { + "epoch": 0.1662, + "grad_norm": 1.1641532489683266, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 16620 + }, + { + "epoch": 0.16621, + "grad_norm": 0.9076208909961722, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 16621 + }, + { + "epoch": 0.16622, + "grad_norm": 0.8762027267478063, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 16622 + }, + { + "epoch": 0.16623, + "grad_norm": 0.7619567616157618, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 16623 + }, + { + "epoch": 0.16624, + "grad_norm": 0.7046215411103308, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 16624 + }, + { + "epoch": 0.16625, + "grad_norm": 0.7429471799885354, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 16625 + }, + { + "epoch": 0.16626, + "grad_norm": 0.878992974495442, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 16626 + }, + { + "epoch": 0.16627, + "grad_norm": 0.9455512526890147, + "learning_rate": 0.003, + "loss": 4.076, + "step": 16627 + }, + { + "epoch": 0.16628, + "grad_norm": 1.0127633391896531, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 16628 + }, + { + "epoch": 0.16629, + "grad_norm": 0.9397277193198198, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 16629 + }, + { + "epoch": 0.1663, + "grad_norm": 0.9040521564205963, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 16630 + }, + { + "epoch": 0.16631, + "grad_norm": 0.8983255252938565, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 16631 + }, + { + "epoch": 0.16632, + "grad_norm": 0.8984472253596174, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 16632 + }, + { + "epoch": 0.16633, + "grad_norm": 1.0058179192726155, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 16633 + }, + { + "epoch": 0.16634, + "grad_norm": 1.0815490574704374, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 16634 + }, + { + "epoch": 0.16635, + "grad_norm": 0.8688003355942993, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 16635 + }, + { + "epoch": 0.16636, + "grad_norm": 0.7486544256271357, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 16636 + }, + { + "epoch": 0.16637, + "grad_norm": 0.6631626591427247, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 16637 + }, + { + "epoch": 0.16638, + "grad_norm": 0.6639624215462988, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 16638 + }, + { + "epoch": 0.16639, + "grad_norm": 0.7204267843529744, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 16639 + }, + { + "epoch": 0.1664, + "grad_norm": 0.7674416371335842, + "learning_rate": 0.003, + "loss": 4.088, + "step": 16640 + }, + { + "epoch": 0.16641, + "grad_norm": 0.7348282416012347, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 16641 + }, + { + "epoch": 0.16642, + "grad_norm": 0.6580299164326259, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 16642 + }, + { + "epoch": 0.16643, + "grad_norm": 0.7486574461277886, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 16643 + }, + { + "epoch": 0.16644, + "grad_norm": 0.8410891945812161, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 16644 + }, + { + "epoch": 0.16645, + "grad_norm": 0.9015567725756612, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 16645 + }, + { + "epoch": 0.16646, + "grad_norm": 0.9497849783573655, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 16646 + }, + { + "epoch": 0.16647, + "grad_norm": 1.044784560631695, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 16647 + }, + { + "epoch": 0.16648, + "grad_norm": 1.2408840364808897, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 16648 + }, + { + "epoch": 0.16649, + "grad_norm": 0.9118155901804015, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 16649 + }, + { + "epoch": 0.1665, + "grad_norm": 0.8503313840325569, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 16650 + }, + { + "epoch": 0.16651, + "grad_norm": 0.8852096406645081, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 16651 + }, + { + "epoch": 0.16652, + "grad_norm": 0.8274706247869366, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 16652 + }, + { + "epoch": 0.16653, + "grad_norm": 0.8336659895344166, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 16653 + }, + { + "epoch": 0.16654, + "grad_norm": 0.7964732585715956, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 16654 + }, + { + "epoch": 0.16655, + "grad_norm": 0.713936537676871, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 16655 + }, + { + "epoch": 0.16656, + "grad_norm": 0.7636968286517395, + "learning_rate": 0.003, + "loss": 4.046, + "step": 16656 + }, + { + "epoch": 0.16657, + "grad_norm": 0.9199190974969985, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 16657 + }, + { + "epoch": 0.16658, + "grad_norm": 1.1881161215787814, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 16658 + }, + { + "epoch": 0.16659, + "grad_norm": 0.8359390921559996, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 16659 + }, + { + "epoch": 0.1666, + "grad_norm": 0.6908175104092933, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 16660 + }, + { + "epoch": 0.16661, + "grad_norm": 0.7125203353404347, + "learning_rate": 0.003, + "loss": 4.04, + "step": 16661 + }, + { + "epoch": 0.16662, + "grad_norm": 0.7957518128079261, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 16662 + }, + { + "epoch": 0.16663, + "grad_norm": 0.8126516335374254, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 16663 + }, + { + "epoch": 0.16664, + "grad_norm": 0.8981446676842118, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 16664 + }, + { + "epoch": 0.16665, + "grad_norm": 0.9207710774603982, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 16665 + }, + { + "epoch": 0.16666, + "grad_norm": 1.0670631481523027, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 16666 + }, + { + "epoch": 0.16667, + "grad_norm": 1.1159190925384723, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 16667 + }, + { + "epoch": 0.16668, + "grad_norm": 0.8755316547655639, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 16668 + }, + { + "epoch": 0.16669, + "grad_norm": 0.823923889411584, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 16669 + }, + { + "epoch": 0.1667, + "grad_norm": 0.9523476313505719, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 16670 + }, + { + "epoch": 0.16671, + "grad_norm": 1.1044233384045328, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 16671 + }, + { + "epoch": 0.16672, + "grad_norm": 1.0050174126244413, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 16672 + }, + { + "epoch": 0.16673, + "grad_norm": 0.9198252547094358, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 16673 + }, + { + "epoch": 0.16674, + "grad_norm": 1.0250224458877255, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 16674 + }, + { + "epoch": 0.16675, + "grad_norm": 1.0674086051971232, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 16675 + }, + { + "epoch": 0.16676, + "grad_norm": 1.034548655186563, + "learning_rate": 0.003, + "loss": 4.059, + "step": 16676 + }, + { + "epoch": 0.16677, + "grad_norm": 0.8363754754531006, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 16677 + }, + { + "epoch": 0.16678, + "grad_norm": 1.0221453849310609, + "learning_rate": 0.003, + "loss": 4.085, + "step": 16678 + }, + { + "epoch": 0.16679, + "grad_norm": 1.2720385853628837, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 16679 + }, + { + "epoch": 0.1668, + "grad_norm": 0.8660987181035079, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 16680 + }, + { + "epoch": 0.16681, + "grad_norm": 0.8373537570517481, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 16681 + }, + { + "epoch": 0.16682, + "grad_norm": 0.7504840831491315, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 16682 + }, + { + "epoch": 0.16683, + "grad_norm": 0.8274429077104438, + "learning_rate": 0.003, + "loss": 4.068, + "step": 16683 + }, + { + "epoch": 0.16684, + "grad_norm": 0.8174868885434718, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 16684 + }, + { + "epoch": 0.16685, + "grad_norm": 0.6824686537776473, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 16685 + }, + { + "epoch": 0.16686, + "grad_norm": 0.6726549298968327, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 16686 + }, + { + "epoch": 0.16687, + "grad_norm": 0.7826364611423184, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 16687 + }, + { + "epoch": 0.16688, + "grad_norm": 0.8458249616171628, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 16688 + }, + { + "epoch": 0.16689, + "grad_norm": 0.892481267185803, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 16689 + }, + { + "epoch": 0.1669, + "grad_norm": 1.119182684290395, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 16690 + }, + { + "epoch": 0.16691, + "grad_norm": 1.0885676419308759, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 16691 + }, + { + "epoch": 0.16692, + "grad_norm": 0.8871868582661631, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 16692 + }, + { + "epoch": 0.16693, + "grad_norm": 0.9075311293445458, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 16693 + }, + { + "epoch": 0.16694, + "grad_norm": 0.8219362385852115, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 16694 + }, + { + "epoch": 0.16695, + "grad_norm": 0.7217350868604105, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 16695 + }, + { + "epoch": 0.16696, + "grad_norm": 0.6635000555575337, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 16696 + }, + { + "epoch": 0.16697, + "grad_norm": 0.7468625388147383, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 16697 + }, + { + "epoch": 0.16698, + "grad_norm": 0.7569472862038678, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 16698 + }, + { + "epoch": 0.16699, + "grad_norm": 0.688775075962298, + "learning_rate": 0.003, + "loss": 4.041, + "step": 16699 + }, + { + "epoch": 0.167, + "grad_norm": 0.7881719998264674, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 16700 + }, + { + "epoch": 0.16701, + "grad_norm": 0.9828491392111073, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 16701 + }, + { + "epoch": 0.16702, + "grad_norm": 1.1219184881620372, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 16702 + }, + { + "epoch": 0.16703, + "grad_norm": 1.030981913210529, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 16703 + }, + { + "epoch": 0.16704, + "grad_norm": 0.9494470020091834, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 16704 + }, + { + "epoch": 0.16705, + "grad_norm": 1.1174629570337802, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 16705 + }, + { + "epoch": 0.16706, + "grad_norm": 1.0142969666302373, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 16706 + }, + { + "epoch": 0.16707, + "grad_norm": 0.7572277314580239, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 16707 + }, + { + "epoch": 0.16708, + "grad_norm": 0.7004178767027441, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 16708 + }, + { + "epoch": 0.16709, + "grad_norm": 0.6831002573420436, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 16709 + }, + { + "epoch": 0.1671, + "grad_norm": 0.7516793822840889, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 16710 + }, + { + "epoch": 0.16711, + "grad_norm": 0.7018536133780285, + "learning_rate": 0.003, + "loss": 4.034, + "step": 16711 + }, + { + "epoch": 0.16712, + "grad_norm": 0.6719485607798802, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 16712 + }, + { + "epoch": 0.16713, + "grad_norm": 0.6965262460342196, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 16713 + }, + { + "epoch": 0.16714, + "grad_norm": 0.6724165567767315, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 16714 + }, + { + "epoch": 0.16715, + "grad_norm": 0.5872136714796434, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 16715 + }, + { + "epoch": 0.16716, + "grad_norm": 0.5840214977574147, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 16716 + }, + { + "epoch": 0.16717, + "grad_norm": 0.7265828835126165, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 16717 + }, + { + "epoch": 0.16718, + "grad_norm": 1.0963337524529386, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 16718 + }, + { + "epoch": 0.16719, + "grad_norm": 1.1380126288081818, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 16719 + }, + { + "epoch": 0.1672, + "grad_norm": 0.8385039946950212, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 16720 + }, + { + "epoch": 0.16721, + "grad_norm": 0.680709102532998, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 16721 + }, + { + "epoch": 0.16722, + "grad_norm": 0.653449249403669, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 16722 + }, + { + "epoch": 0.16723, + "grad_norm": 0.716150777958729, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 16723 + }, + { + "epoch": 0.16724, + "grad_norm": 0.7306702448232821, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 16724 + }, + { + "epoch": 0.16725, + "grad_norm": 0.9621919836828025, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 16725 + }, + { + "epoch": 0.16726, + "grad_norm": 1.1186628933956517, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 16726 + }, + { + "epoch": 0.16727, + "grad_norm": 0.7780106037705267, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 16727 + }, + { + "epoch": 0.16728, + "grad_norm": 0.853706730233926, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 16728 + }, + { + "epoch": 0.16729, + "grad_norm": 0.9424718190049373, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 16729 + }, + { + "epoch": 0.1673, + "grad_norm": 1.122484521377639, + "learning_rate": 0.003, + "loss": 4.12, + "step": 16730 + }, + { + "epoch": 0.16731, + "grad_norm": 1.0243242933080432, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 16731 + }, + { + "epoch": 0.16732, + "grad_norm": 1.0680102268558531, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 16732 + }, + { + "epoch": 0.16733, + "grad_norm": 1.0557716208896646, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 16733 + }, + { + "epoch": 0.16734, + "grad_norm": 1.0717652968490432, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 16734 + }, + { + "epoch": 0.16735, + "grad_norm": 1.0822275586531358, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 16735 + }, + { + "epoch": 0.16736, + "grad_norm": 1.039304281817275, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 16736 + }, + { + "epoch": 0.16737, + "grad_norm": 1.1432653601237608, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 16737 + }, + { + "epoch": 0.16738, + "grad_norm": 0.9824314171564776, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 16738 + }, + { + "epoch": 0.16739, + "grad_norm": 0.9860950685151864, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 16739 + }, + { + "epoch": 0.1674, + "grad_norm": 1.0728495562678955, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 16740 + }, + { + "epoch": 0.16741, + "grad_norm": 1.0102484941139138, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 16741 + }, + { + "epoch": 0.16742, + "grad_norm": 1.0639365017792461, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 16742 + }, + { + "epoch": 0.16743, + "grad_norm": 1.0673694705953947, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 16743 + }, + { + "epoch": 0.16744, + "grad_norm": 1.1906288768068023, + "learning_rate": 0.003, + "loss": 4.1095, + "step": 16744 + }, + { + "epoch": 0.16745, + "grad_norm": 1.0099529214659242, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 16745 + }, + { + "epoch": 0.16746, + "grad_norm": 0.7696819389456069, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 16746 + }, + { + "epoch": 0.16747, + "grad_norm": 0.7087115276953534, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 16747 + }, + { + "epoch": 0.16748, + "grad_norm": 0.6618824327696065, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 16748 + }, + { + "epoch": 0.16749, + "grad_norm": 0.7209320568784208, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 16749 + }, + { + "epoch": 0.1675, + "grad_norm": 0.64605340164109, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 16750 + }, + { + "epoch": 0.16751, + "grad_norm": 0.6514167008229829, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 16751 + }, + { + "epoch": 0.16752, + "grad_norm": 0.6984598851722965, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 16752 + }, + { + "epoch": 0.16753, + "grad_norm": 0.806041428263782, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 16753 + }, + { + "epoch": 0.16754, + "grad_norm": 0.8372914861193819, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 16754 + }, + { + "epoch": 0.16755, + "grad_norm": 0.8948825987007641, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 16755 + }, + { + "epoch": 0.16756, + "grad_norm": 0.9066786063886627, + "learning_rate": 0.003, + "loss": 4.035, + "step": 16756 + }, + { + "epoch": 0.16757, + "grad_norm": 0.8809460789147684, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 16757 + }, + { + "epoch": 0.16758, + "grad_norm": 0.9161379809246357, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 16758 + }, + { + "epoch": 0.16759, + "grad_norm": 0.8440855019154663, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 16759 + }, + { + "epoch": 0.1676, + "grad_norm": 0.8539885715059418, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 16760 + }, + { + "epoch": 0.16761, + "grad_norm": 0.9609780072492677, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 16761 + }, + { + "epoch": 0.16762, + "grad_norm": 1.1560485532817966, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 16762 + }, + { + "epoch": 0.16763, + "grad_norm": 0.8631234786759063, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 16763 + }, + { + "epoch": 0.16764, + "grad_norm": 0.8246721104258516, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 16764 + }, + { + "epoch": 0.16765, + "grad_norm": 0.7935901119471722, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 16765 + }, + { + "epoch": 0.16766, + "grad_norm": 0.7896718099564398, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 16766 + }, + { + "epoch": 0.16767, + "grad_norm": 0.8989561638589851, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 16767 + }, + { + "epoch": 0.16768, + "grad_norm": 0.8084974032941903, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 16768 + }, + { + "epoch": 0.16769, + "grad_norm": 0.6636195747898832, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 16769 + }, + { + "epoch": 0.1677, + "grad_norm": 0.6090637753516286, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 16770 + }, + { + "epoch": 0.16771, + "grad_norm": 0.716113509832652, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 16771 + }, + { + "epoch": 0.16772, + "grad_norm": 1.0752000136954618, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 16772 + }, + { + "epoch": 0.16773, + "grad_norm": 1.2568956870851655, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 16773 + }, + { + "epoch": 0.16774, + "grad_norm": 0.690306332321971, + "learning_rate": 0.003, + "loss": 4.068, + "step": 16774 + }, + { + "epoch": 0.16775, + "grad_norm": 0.7003821794820632, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 16775 + }, + { + "epoch": 0.16776, + "grad_norm": 0.7885371785917205, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 16776 + }, + { + "epoch": 0.16777, + "grad_norm": 0.7484215582500324, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 16777 + }, + { + "epoch": 0.16778, + "grad_norm": 0.8139857628190452, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 16778 + }, + { + "epoch": 0.16779, + "grad_norm": 0.7820292388934653, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 16779 + }, + { + "epoch": 0.1678, + "grad_norm": 0.8103586517041421, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 16780 + }, + { + "epoch": 0.16781, + "grad_norm": 0.9732764987293661, + "learning_rate": 0.003, + "loss": 4.1115, + "step": 16781 + }, + { + "epoch": 0.16782, + "grad_norm": 1.0508315937354935, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 16782 + }, + { + "epoch": 0.16783, + "grad_norm": 1.1351229332308257, + "learning_rate": 0.003, + "loss": 4.093, + "step": 16783 + }, + { + "epoch": 0.16784, + "grad_norm": 0.9168068840896803, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 16784 + }, + { + "epoch": 0.16785, + "grad_norm": 0.8953446696867371, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 16785 + }, + { + "epoch": 0.16786, + "grad_norm": 0.8365816798135379, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 16786 + }, + { + "epoch": 0.16787, + "grad_norm": 0.8323806854912139, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 16787 + }, + { + "epoch": 0.16788, + "grad_norm": 0.9497142736006731, + "learning_rate": 0.003, + "loss": 4.095, + "step": 16788 + }, + { + "epoch": 0.16789, + "grad_norm": 1.096878713543592, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 16789 + }, + { + "epoch": 0.1679, + "grad_norm": 1.1031005450161495, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 16790 + }, + { + "epoch": 0.16791, + "grad_norm": 0.8738495470403181, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 16791 + }, + { + "epoch": 0.16792, + "grad_norm": 0.9152539305550742, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 16792 + }, + { + "epoch": 0.16793, + "grad_norm": 0.9043708831672599, + "learning_rate": 0.003, + "loss": 4.058, + "step": 16793 + }, + { + "epoch": 0.16794, + "grad_norm": 0.875149090923743, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 16794 + }, + { + "epoch": 0.16795, + "grad_norm": 0.8435276843851752, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 16795 + }, + { + "epoch": 0.16796, + "grad_norm": 0.8694933853930026, + "learning_rate": 0.003, + "loss": 4.035, + "step": 16796 + }, + { + "epoch": 0.16797, + "grad_norm": 0.8594162744869177, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 16797 + }, + { + "epoch": 0.16798, + "grad_norm": 0.8335938110380197, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 16798 + }, + { + "epoch": 0.16799, + "grad_norm": 0.8401286558201116, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 16799 + }, + { + "epoch": 0.168, + "grad_norm": 1.0148988626211617, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 16800 + }, + { + "epoch": 0.16801, + "grad_norm": 1.1784713719652615, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 16801 + }, + { + "epoch": 0.16802, + "grad_norm": 0.9581177199857444, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 16802 + }, + { + "epoch": 0.16803, + "grad_norm": 0.938081288603496, + "learning_rate": 0.003, + "loss": 4.106, + "step": 16803 + }, + { + "epoch": 0.16804, + "grad_norm": 0.9632259496080161, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 16804 + }, + { + "epoch": 0.16805, + "grad_norm": 0.934841255896249, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 16805 + }, + { + "epoch": 0.16806, + "grad_norm": 0.901274662548973, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 16806 + }, + { + "epoch": 0.16807, + "grad_norm": 0.9566533578116455, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 16807 + }, + { + "epoch": 0.16808, + "grad_norm": 0.9186767560775385, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 16808 + }, + { + "epoch": 0.16809, + "grad_norm": 0.9062104024577505, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 16809 + }, + { + "epoch": 0.1681, + "grad_norm": 0.9360000010205733, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 16810 + }, + { + "epoch": 0.16811, + "grad_norm": 1.130700436145991, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 16811 + }, + { + "epoch": 0.16812, + "grad_norm": 1.0286298599655, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 16812 + }, + { + "epoch": 0.16813, + "grad_norm": 1.0227472417847407, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 16813 + }, + { + "epoch": 0.16814, + "grad_norm": 0.9291237381618836, + "learning_rate": 0.003, + "loss": 4.05, + "step": 16814 + }, + { + "epoch": 0.16815, + "grad_norm": 0.7853585844044454, + "learning_rate": 0.003, + "loss": 4.111, + "step": 16815 + }, + { + "epoch": 0.16816, + "grad_norm": 0.7630891220068708, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 16816 + }, + { + "epoch": 0.16817, + "grad_norm": 0.8176219127342664, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 16817 + }, + { + "epoch": 0.16818, + "grad_norm": 0.8442687593616304, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 16818 + }, + { + "epoch": 0.16819, + "grad_norm": 0.7733673292555855, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 16819 + }, + { + "epoch": 0.1682, + "grad_norm": 0.7667584541217659, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 16820 + }, + { + "epoch": 0.16821, + "grad_norm": 0.7720845035105838, + "learning_rate": 0.003, + "loss": 4.034, + "step": 16821 + }, + { + "epoch": 0.16822, + "grad_norm": 0.7949675295815203, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 16822 + }, + { + "epoch": 0.16823, + "grad_norm": 0.845912537748116, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 16823 + }, + { + "epoch": 0.16824, + "grad_norm": 0.9190522756506045, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 16824 + }, + { + "epoch": 0.16825, + "grad_norm": 1.0767841771935078, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 16825 + }, + { + "epoch": 0.16826, + "grad_norm": 0.960922952409664, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 16826 + }, + { + "epoch": 0.16827, + "grad_norm": 0.812503792352589, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 16827 + }, + { + "epoch": 0.16828, + "grad_norm": 0.7715450756967425, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 16828 + }, + { + "epoch": 0.16829, + "grad_norm": 0.7138493846893461, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 16829 + }, + { + "epoch": 0.1683, + "grad_norm": 0.7559171902088815, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 16830 + }, + { + "epoch": 0.16831, + "grad_norm": 0.8454544722767993, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 16831 + }, + { + "epoch": 0.16832, + "grad_norm": 0.947185485561565, + "learning_rate": 0.003, + "loss": 4.059, + "step": 16832 + }, + { + "epoch": 0.16833, + "grad_norm": 1.177057153893723, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 16833 + }, + { + "epoch": 0.16834, + "grad_norm": 0.8277736530030745, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 16834 + }, + { + "epoch": 0.16835, + "grad_norm": 0.8403656624243396, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 16835 + }, + { + "epoch": 0.16836, + "grad_norm": 0.9701520335255543, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 16836 + }, + { + "epoch": 0.16837, + "grad_norm": 0.9429902242346855, + "learning_rate": 0.003, + "loss": 4.063, + "step": 16837 + }, + { + "epoch": 0.16838, + "grad_norm": 0.8886462276062898, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 16838 + }, + { + "epoch": 0.16839, + "grad_norm": 0.8376289744620194, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 16839 + }, + { + "epoch": 0.1684, + "grad_norm": 0.7963345186826152, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 16840 + }, + { + "epoch": 0.16841, + "grad_norm": 0.8549701938022071, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 16841 + }, + { + "epoch": 0.16842, + "grad_norm": 0.9960237379242167, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 16842 + }, + { + "epoch": 0.16843, + "grad_norm": 1.193643044924253, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 16843 + }, + { + "epoch": 0.16844, + "grad_norm": 0.9164143389429955, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 16844 + }, + { + "epoch": 0.16845, + "grad_norm": 0.8538321551106992, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 16845 + }, + { + "epoch": 0.16846, + "grad_norm": 0.9043740316393933, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 16846 + }, + { + "epoch": 0.16847, + "grad_norm": 0.9090509100819298, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 16847 + }, + { + "epoch": 0.16848, + "grad_norm": 0.9047829484022039, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 16848 + }, + { + "epoch": 0.16849, + "grad_norm": 0.9713622214580283, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 16849 + }, + { + "epoch": 0.1685, + "grad_norm": 1.039021503963327, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 16850 + }, + { + "epoch": 0.16851, + "grad_norm": 1.0507703816270693, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 16851 + }, + { + "epoch": 0.16852, + "grad_norm": 0.8394518194000056, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 16852 + }, + { + "epoch": 0.16853, + "grad_norm": 1.0097475429211138, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 16853 + }, + { + "epoch": 0.16854, + "grad_norm": 1.1792438245766939, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 16854 + }, + { + "epoch": 0.16855, + "grad_norm": 0.8313906406345619, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 16855 + }, + { + "epoch": 0.16856, + "grad_norm": 0.7988631433459336, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 16856 + }, + { + "epoch": 0.16857, + "grad_norm": 0.9343265695967697, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 16857 + }, + { + "epoch": 0.16858, + "grad_norm": 1.0242695154794679, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 16858 + }, + { + "epoch": 0.16859, + "grad_norm": 0.9642330942979943, + "learning_rate": 0.003, + "loss": 4.064, + "step": 16859 + }, + { + "epoch": 0.1686, + "grad_norm": 0.8937438287868996, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 16860 + }, + { + "epoch": 0.16861, + "grad_norm": 0.752504664058674, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 16861 + }, + { + "epoch": 0.16862, + "grad_norm": 0.6544359707823012, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 16862 + }, + { + "epoch": 0.16863, + "grad_norm": 0.5945977205078491, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 16863 + }, + { + "epoch": 0.16864, + "grad_norm": 0.6396058325438535, + "learning_rate": 0.003, + "loss": 4.059, + "step": 16864 + }, + { + "epoch": 0.16865, + "grad_norm": 0.7140251684892681, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 16865 + }, + { + "epoch": 0.16866, + "grad_norm": 0.9480995066162021, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 16866 + }, + { + "epoch": 0.16867, + "grad_norm": 1.128713407564246, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 16867 + }, + { + "epoch": 0.16868, + "grad_norm": 1.0667592928487957, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 16868 + }, + { + "epoch": 0.16869, + "grad_norm": 0.8599187547357423, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 16869 + }, + { + "epoch": 0.1687, + "grad_norm": 0.720566056469061, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 16870 + }, + { + "epoch": 0.16871, + "grad_norm": 0.7918365376640389, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 16871 + }, + { + "epoch": 0.16872, + "grad_norm": 0.8167634892644354, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 16872 + }, + { + "epoch": 0.16873, + "grad_norm": 0.7492730495927574, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 16873 + }, + { + "epoch": 0.16874, + "grad_norm": 0.8433626659332906, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 16874 + }, + { + "epoch": 0.16875, + "grad_norm": 1.1800267673653257, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 16875 + }, + { + "epoch": 0.16876, + "grad_norm": 0.7818492223452557, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 16876 + }, + { + "epoch": 0.16877, + "grad_norm": 0.6623675644844721, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 16877 + }, + { + "epoch": 0.16878, + "grad_norm": 0.7037544274431712, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 16878 + }, + { + "epoch": 0.16879, + "grad_norm": 0.6733415151107367, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 16879 + }, + { + "epoch": 0.1688, + "grad_norm": 0.6994694575696391, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 16880 + }, + { + "epoch": 0.16881, + "grad_norm": 0.6649583705484471, + "learning_rate": 0.003, + "loss": 4.045, + "step": 16881 + }, + { + "epoch": 0.16882, + "grad_norm": 0.6816005272499164, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 16882 + }, + { + "epoch": 0.16883, + "grad_norm": 0.7513439213100401, + "learning_rate": 0.003, + "loss": 4.07, + "step": 16883 + }, + { + "epoch": 0.16884, + "grad_norm": 0.7662707567485666, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 16884 + }, + { + "epoch": 0.16885, + "grad_norm": 0.8737869709380984, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 16885 + }, + { + "epoch": 0.16886, + "grad_norm": 1.0491863553098975, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 16886 + }, + { + "epoch": 0.16887, + "grad_norm": 0.9325895461294644, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 16887 + }, + { + "epoch": 0.16888, + "grad_norm": 0.9386174844283485, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 16888 + }, + { + "epoch": 0.16889, + "grad_norm": 0.9615205005299152, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 16889 + }, + { + "epoch": 0.1689, + "grad_norm": 0.824528194402005, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 16890 + }, + { + "epoch": 0.16891, + "grad_norm": 0.8956823000248313, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 16891 + }, + { + "epoch": 0.16892, + "grad_norm": 1.0088333248260806, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 16892 + }, + { + "epoch": 0.16893, + "grad_norm": 1.0957559904151173, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 16893 + }, + { + "epoch": 0.16894, + "grad_norm": 0.9732333595934364, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 16894 + }, + { + "epoch": 0.16895, + "grad_norm": 1.0542448399915376, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 16895 + }, + { + "epoch": 0.16896, + "grad_norm": 1.0619033548809447, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 16896 + }, + { + "epoch": 0.16897, + "grad_norm": 1.0510395230679006, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 16897 + }, + { + "epoch": 0.16898, + "grad_norm": 1.0410623036910303, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 16898 + }, + { + "epoch": 0.16899, + "grad_norm": 0.9883500016051606, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 16899 + }, + { + "epoch": 0.169, + "grad_norm": 1.0462935391497992, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 16900 + }, + { + "epoch": 0.16901, + "grad_norm": 0.9338345911769358, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 16901 + }, + { + "epoch": 0.16902, + "grad_norm": 0.9126746141590983, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 16902 + }, + { + "epoch": 0.16903, + "grad_norm": 1.0236213603262032, + "learning_rate": 0.003, + "loss": 4.067, + "step": 16903 + }, + { + "epoch": 0.16904, + "grad_norm": 1.0459999430308622, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 16904 + }, + { + "epoch": 0.16905, + "grad_norm": 0.8938821231306587, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 16905 + }, + { + "epoch": 0.16906, + "grad_norm": 0.8681891094987683, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 16906 + }, + { + "epoch": 0.16907, + "grad_norm": 0.9247683188024998, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 16907 + }, + { + "epoch": 0.16908, + "grad_norm": 0.8913802332168116, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 16908 + }, + { + "epoch": 0.16909, + "grad_norm": 0.9217980419563984, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 16909 + }, + { + "epoch": 0.1691, + "grad_norm": 0.995531226179752, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 16910 + }, + { + "epoch": 0.16911, + "grad_norm": 1.0025440270873596, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 16911 + }, + { + "epoch": 0.16912, + "grad_norm": 1.0363056443442558, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 16912 + }, + { + "epoch": 0.16913, + "grad_norm": 0.9892047324839762, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 16913 + }, + { + "epoch": 0.16914, + "grad_norm": 1.0864104874845777, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 16914 + }, + { + "epoch": 0.16915, + "grad_norm": 0.843171475146014, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 16915 + }, + { + "epoch": 0.16916, + "grad_norm": 0.8561440496616702, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 16916 + }, + { + "epoch": 0.16917, + "grad_norm": 0.8163104147218143, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 16917 + }, + { + "epoch": 0.16918, + "grad_norm": 0.7246679817960276, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 16918 + }, + { + "epoch": 0.16919, + "grad_norm": 0.6402709068614789, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 16919 + }, + { + "epoch": 0.1692, + "grad_norm": 0.6744517276472858, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 16920 + }, + { + "epoch": 0.16921, + "grad_norm": 0.6548234665914778, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 16921 + }, + { + "epoch": 0.16922, + "grad_norm": 0.8375466753682494, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 16922 + }, + { + "epoch": 0.16923, + "grad_norm": 1.1733751383402757, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 16923 + }, + { + "epoch": 0.16924, + "grad_norm": 1.0683641233583197, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 16924 + }, + { + "epoch": 0.16925, + "grad_norm": 0.8290224063060054, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 16925 + }, + { + "epoch": 0.16926, + "grad_norm": 0.8198511691450935, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 16926 + }, + { + "epoch": 0.16927, + "grad_norm": 0.8725762743683417, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 16927 + }, + { + "epoch": 0.16928, + "grad_norm": 0.8838825462665559, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 16928 + }, + { + "epoch": 0.16929, + "grad_norm": 0.9107745212778506, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 16929 + }, + { + "epoch": 0.1693, + "grad_norm": 1.0482183640735203, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 16930 + }, + { + "epoch": 0.16931, + "grad_norm": 1.1633738422282487, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 16931 + }, + { + "epoch": 0.16932, + "grad_norm": 0.8834701888199146, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 16932 + }, + { + "epoch": 0.16933, + "grad_norm": 0.7874448060794419, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 16933 + }, + { + "epoch": 0.16934, + "grad_norm": 0.752700718602391, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 16934 + }, + { + "epoch": 0.16935, + "grad_norm": 0.7261182902276447, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 16935 + }, + { + "epoch": 0.16936, + "grad_norm": 0.5815443494154847, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 16936 + }, + { + "epoch": 0.16937, + "grad_norm": 0.5413130876992222, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 16937 + }, + { + "epoch": 0.16938, + "grad_norm": 0.7148824713935833, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 16938 + }, + { + "epoch": 0.16939, + "grad_norm": 0.8113751917587134, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 16939 + }, + { + "epoch": 0.1694, + "grad_norm": 0.8331120584804382, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 16940 + }, + { + "epoch": 0.16941, + "grad_norm": 0.8783155396073156, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 16941 + }, + { + "epoch": 0.16942, + "grad_norm": 1.0229106091113127, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 16942 + }, + { + "epoch": 0.16943, + "grad_norm": 0.9913001669954191, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 16943 + }, + { + "epoch": 0.16944, + "grad_norm": 0.8444409482864496, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 16944 + }, + { + "epoch": 0.16945, + "grad_norm": 0.8183763055959205, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 16945 + }, + { + "epoch": 0.16946, + "grad_norm": 0.9617361905409932, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 16946 + }, + { + "epoch": 0.16947, + "grad_norm": 1.186039695712203, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 16947 + }, + { + "epoch": 0.16948, + "grad_norm": 0.9797662129440634, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 16948 + }, + { + "epoch": 0.16949, + "grad_norm": 0.8866228193879615, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 16949 + }, + { + "epoch": 0.1695, + "grad_norm": 0.9455632193764086, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 16950 + }, + { + "epoch": 0.16951, + "grad_norm": 0.9845815656400316, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 16951 + }, + { + "epoch": 0.16952, + "grad_norm": 0.9698630666524807, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 16952 + }, + { + "epoch": 0.16953, + "grad_norm": 0.9263444676583883, + "learning_rate": 0.003, + "loss": 4.07, + "step": 16953 + }, + { + "epoch": 0.16954, + "grad_norm": 0.9877001745289326, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 16954 + }, + { + "epoch": 0.16955, + "grad_norm": 1.0565279828511083, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 16955 + }, + { + "epoch": 0.16956, + "grad_norm": 0.8518040471049105, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 16956 + }, + { + "epoch": 0.16957, + "grad_norm": 0.7375777017857805, + "learning_rate": 0.003, + "loss": 4.077, + "step": 16957 + }, + { + "epoch": 0.16958, + "grad_norm": 0.6535865882538622, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 16958 + }, + { + "epoch": 0.16959, + "grad_norm": 0.6095598143028842, + "learning_rate": 0.003, + "loss": 4.04, + "step": 16959 + }, + { + "epoch": 0.1696, + "grad_norm": 0.6236536986198375, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 16960 + }, + { + "epoch": 0.16961, + "grad_norm": 0.6409798092580766, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 16961 + }, + { + "epoch": 0.16962, + "grad_norm": 0.8309216838697241, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 16962 + }, + { + "epoch": 0.16963, + "grad_norm": 0.9791274783309264, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 16963 + }, + { + "epoch": 0.16964, + "grad_norm": 0.9180726458211976, + "learning_rate": 0.003, + "loss": 4.09, + "step": 16964 + }, + { + "epoch": 0.16965, + "grad_norm": 0.80150772996483, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 16965 + }, + { + "epoch": 0.16966, + "grad_norm": 0.7810169671486233, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 16966 + }, + { + "epoch": 0.16967, + "grad_norm": 0.835625859098452, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 16967 + }, + { + "epoch": 0.16968, + "grad_norm": 0.7935090509567666, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 16968 + }, + { + "epoch": 0.16969, + "grad_norm": 0.6853971941848793, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 16969 + }, + { + "epoch": 0.1697, + "grad_norm": 0.686601229050851, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 16970 + }, + { + "epoch": 0.16971, + "grad_norm": 0.6647273955786298, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 16971 + }, + { + "epoch": 0.16972, + "grad_norm": 0.8557843940767302, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 16972 + }, + { + "epoch": 0.16973, + "grad_norm": 1.3058272509003233, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 16973 + }, + { + "epoch": 0.16974, + "grad_norm": 1.1413326711907645, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 16974 + }, + { + "epoch": 0.16975, + "grad_norm": 0.8381572260259262, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 16975 + }, + { + "epoch": 0.16976, + "grad_norm": 0.7782455868370698, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 16976 + }, + { + "epoch": 0.16977, + "grad_norm": 0.9132732569395081, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 16977 + }, + { + "epoch": 0.16978, + "grad_norm": 1.0169807904158978, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 16978 + }, + { + "epoch": 0.16979, + "grad_norm": 0.7758826977474127, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 16979 + }, + { + "epoch": 0.1698, + "grad_norm": 0.7800625323486636, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 16980 + }, + { + "epoch": 0.16981, + "grad_norm": 0.9805905272868272, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 16981 + }, + { + "epoch": 0.16982, + "grad_norm": 1.20335505975646, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 16982 + }, + { + "epoch": 0.16983, + "grad_norm": 0.8596746358564572, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 16983 + }, + { + "epoch": 0.16984, + "grad_norm": 0.8592945046367677, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 16984 + }, + { + "epoch": 0.16985, + "grad_norm": 0.7542904586121594, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 16985 + }, + { + "epoch": 0.16986, + "grad_norm": 0.7917575943374668, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 16986 + }, + { + "epoch": 0.16987, + "grad_norm": 0.9634107763726459, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 16987 + }, + { + "epoch": 0.16988, + "grad_norm": 1.2190631152217408, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 16988 + }, + { + "epoch": 0.16989, + "grad_norm": 1.08970278598891, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 16989 + }, + { + "epoch": 0.1699, + "grad_norm": 0.9868812206834904, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 16990 + }, + { + "epoch": 0.16991, + "grad_norm": 0.9841322262971984, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 16991 + }, + { + "epoch": 0.16992, + "grad_norm": 1.0484360604769196, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 16992 + }, + { + "epoch": 0.16993, + "grad_norm": 0.9260306894912795, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 16993 + }, + { + "epoch": 0.16994, + "grad_norm": 0.9066405327558062, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 16994 + }, + { + "epoch": 0.16995, + "grad_norm": 1.0348207431462089, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 16995 + }, + { + "epoch": 0.16996, + "grad_norm": 0.9191441797665568, + "learning_rate": 0.003, + "loss": 4.054, + "step": 16996 + }, + { + "epoch": 0.16997, + "grad_norm": 0.730506594938041, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 16997 + }, + { + "epoch": 0.16998, + "grad_norm": 0.639203737255759, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 16998 + }, + { + "epoch": 0.16999, + "grad_norm": 0.6657379714702873, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 16999 + }, + { + "epoch": 0.17, + "grad_norm": 0.775351235315513, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 17000 + }, + { + "epoch": 0.17001, + "grad_norm": 0.7991960468995907, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 17001 + }, + { + "epoch": 0.17002, + "grad_norm": 0.87198689526849, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 17002 + }, + { + "epoch": 0.17003, + "grad_norm": 0.9285139097854309, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 17003 + }, + { + "epoch": 0.17004, + "grad_norm": 0.7932451824493771, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 17004 + }, + { + "epoch": 0.17005, + "grad_norm": 0.7035403585610972, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 17005 + }, + { + "epoch": 0.17006, + "grad_norm": 0.740581962118741, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 17006 + }, + { + "epoch": 0.17007, + "grad_norm": 0.8592684849435771, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 17007 + }, + { + "epoch": 0.17008, + "grad_norm": 1.108864136209052, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 17008 + }, + { + "epoch": 0.17009, + "grad_norm": 1.0887487399263103, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 17009 + }, + { + "epoch": 0.1701, + "grad_norm": 0.902741685828884, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 17010 + }, + { + "epoch": 0.17011, + "grad_norm": 0.9621009107416355, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 17011 + }, + { + "epoch": 0.17012, + "grad_norm": 0.9067105355899577, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 17012 + }, + { + "epoch": 0.17013, + "grad_norm": 0.8505921216731109, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 17013 + }, + { + "epoch": 0.17014, + "grad_norm": 0.8078186108669917, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 17014 + }, + { + "epoch": 0.17015, + "grad_norm": 0.8912264605475723, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 17015 + }, + { + "epoch": 0.17016, + "grad_norm": 0.8849887110904964, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 17016 + }, + { + "epoch": 0.17017, + "grad_norm": 0.8490462547011455, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 17017 + }, + { + "epoch": 0.17018, + "grad_norm": 0.8282127625312425, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 17018 + }, + { + "epoch": 0.17019, + "grad_norm": 0.7833936367868143, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 17019 + }, + { + "epoch": 0.1702, + "grad_norm": 0.7466115154262697, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 17020 + }, + { + "epoch": 0.17021, + "grad_norm": 0.7434084377161972, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 17021 + }, + { + "epoch": 0.17022, + "grad_norm": 0.779901566712444, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 17022 + }, + { + "epoch": 0.17023, + "grad_norm": 0.8029196783649748, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 17023 + }, + { + "epoch": 0.17024, + "grad_norm": 0.8266168430994425, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 17024 + }, + { + "epoch": 0.17025, + "grad_norm": 0.8070319728002024, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 17025 + }, + { + "epoch": 0.17026, + "grad_norm": 0.7384919804956861, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 17026 + }, + { + "epoch": 0.17027, + "grad_norm": 0.9538535182299456, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 17027 + }, + { + "epoch": 0.17028, + "grad_norm": 1.2190767863757057, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 17028 + }, + { + "epoch": 0.17029, + "grad_norm": 0.9038072468944277, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 17029 + }, + { + "epoch": 0.1703, + "grad_norm": 0.8002975089514168, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 17030 + }, + { + "epoch": 0.17031, + "grad_norm": 0.8229617505492536, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 17031 + }, + { + "epoch": 0.17032, + "grad_norm": 0.8853914211914459, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 17032 + }, + { + "epoch": 0.17033, + "grad_norm": 1.2535034786768737, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 17033 + }, + { + "epoch": 0.17034, + "grad_norm": 1.1634495331454608, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 17034 + }, + { + "epoch": 0.17035, + "grad_norm": 0.9770624289561806, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 17035 + }, + { + "epoch": 0.17036, + "grad_norm": 0.9636810900500455, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 17036 + }, + { + "epoch": 0.17037, + "grad_norm": 0.9677334965571742, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 17037 + }, + { + "epoch": 0.17038, + "grad_norm": 1.0137456362567268, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 17038 + }, + { + "epoch": 0.17039, + "grad_norm": 0.9311855857985665, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 17039 + }, + { + "epoch": 0.1704, + "grad_norm": 0.8705171238203692, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 17040 + }, + { + "epoch": 0.17041, + "grad_norm": 0.9120460747983312, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 17041 + }, + { + "epoch": 0.17042, + "grad_norm": 0.9000611445305694, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 17042 + }, + { + "epoch": 0.17043, + "grad_norm": 0.9149092235683032, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 17043 + }, + { + "epoch": 0.17044, + "grad_norm": 0.9581749747685351, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 17044 + }, + { + "epoch": 0.17045, + "grad_norm": 1.110605387279622, + "learning_rate": 0.003, + "loss": 4.088, + "step": 17045 + }, + { + "epoch": 0.17046, + "grad_norm": 1.0004888932585712, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 17046 + }, + { + "epoch": 0.17047, + "grad_norm": 1.1423249502789607, + "learning_rate": 0.003, + "loss": 4.087, + "step": 17047 + }, + { + "epoch": 0.17048, + "grad_norm": 1.0049720562907372, + "learning_rate": 0.003, + "loss": 4.101, + "step": 17048 + }, + { + "epoch": 0.17049, + "grad_norm": 0.9518434773031919, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 17049 + }, + { + "epoch": 0.1705, + "grad_norm": 0.8336815905772503, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 17050 + }, + { + "epoch": 0.17051, + "grad_norm": 0.8464438414693694, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 17051 + }, + { + "epoch": 0.17052, + "grad_norm": 0.7407423122925679, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 17052 + }, + { + "epoch": 0.17053, + "grad_norm": 0.693597921646556, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 17053 + }, + { + "epoch": 0.17054, + "grad_norm": 0.7224347609205675, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 17054 + }, + { + "epoch": 0.17055, + "grad_norm": 0.727109434885961, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 17055 + }, + { + "epoch": 0.17056, + "grad_norm": 0.7085929468906867, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 17056 + }, + { + "epoch": 0.17057, + "grad_norm": 0.8981748123460447, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 17057 + }, + { + "epoch": 0.17058, + "grad_norm": 2.2373522381619755, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 17058 + }, + { + "epoch": 0.17059, + "grad_norm": 0.9093744697365344, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 17059 + }, + { + "epoch": 0.1706, + "grad_norm": 1.1277623734429008, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 17060 + }, + { + "epoch": 0.17061, + "grad_norm": 1.5430590966089344, + "learning_rate": 0.003, + "loss": 4.1294, + "step": 17061 + }, + { + "epoch": 0.17062, + "grad_norm": 2.038189289617899, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 17062 + }, + { + "epoch": 0.17063, + "grad_norm": 1.2126336199709753, + "learning_rate": 0.003, + "loss": 4.1344, + "step": 17063 + }, + { + "epoch": 0.17064, + "grad_norm": 1.5421381235140816, + "learning_rate": 0.003, + "loss": 4.1432, + "step": 17064 + }, + { + "epoch": 0.17065, + "grad_norm": 1.1488223955809493, + "learning_rate": 0.003, + "loss": 4.1663, + "step": 17065 + }, + { + "epoch": 0.17066, + "grad_norm": 1.207433546749588, + "learning_rate": 0.003, + "loss": 4.1518, + "step": 17066 + }, + { + "epoch": 0.17067, + "grad_norm": 1.0140043474005396, + "learning_rate": 0.003, + "loss": 4.1684, + "step": 17067 + }, + { + "epoch": 0.17068, + "grad_norm": 1.188130617619142, + "learning_rate": 0.003, + "loss": 4.1824, + "step": 17068 + }, + { + "epoch": 0.17069, + "grad_norm": 0.919306923692545, + "learning_rate": 0.003, + "loss": 4.1696, + "step": 17069 + }, + { + "epoch": 0.1707, + "grad_norm": 0.9866630603168417, + "learning_rate": 0.003, + "loss": 4.1475, + "step": 17070 + }, + { + "epoch": 0.17071, + "grad_norm": 1.329511049506572, + "learning_rate": 0.003, + "loss": 4.1381, + "step": 17071 + }, + { + "epoch": 0.17072, + "grad_norm": 0.9106345385856193, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 17072 + }, + { + "epoch": 0.17073, + "grad_norm": 1.1764960321502664, + "learning_rate": 0.003, + "loss": 4.1519, + "step": 17073 + }, + { + "epoch": 0.17074, + "grad_norm": 1.2598383828849962, + "learning_rate": 0.003, + "loss": 4.1498, + "step": 17074 + }, + { + "epoch": 0.17075, + "grad_norm": 1.0509211538495036, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 17075 + }, + { + "epoch": 0.17076, + "grad_norm": 0.9102646245035444, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 17076 + }, + { + "epoch": 0.17077, + "grad_norm": 0.8591655765967405, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 17077 + }, + { + "epoch": 0.17078, + "grad_norm": 0.9587677674346851, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 17078 + }, + { + "epoch": 0.17079, + "grad_norm": 0.9367928599234552, + "learning_rate": 0.003, + "loss": 4.118, + "step": 17079 + }, + { + "epoch": 0.1708, + "grad_norm": 1.0437382963401483, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 17080 + }, + { + "epoch": 0.17081, + "grad_norm": 1.1473030302395222, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 17081 + }, + { + "epoch": 0.17082, + "grad_norm": 0.9106070465700841, + "learning_rate": 0.003, + "loss": 4.1496, + "step": 17082 + }, + { + "epoch": 0.17083, + "grad_norm": 0.8471341271322517, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 17083 + }, + { + "epoch": 0.17084, + "grad_norm": 0.7975919978994044, + "learning_rate": 0.003, + "loss": 4.1122, + "step": 17084 + }, + { + "epoch": 0.17085, + "grad_norm": 0.7186499293677238, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 17085 + }, + { + "epoch": 0.17086, + "grad_norm": 0.6105221851269854, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 17086 + }, + { + "epoch": 0.17087, + "grad_norm": 0.5659929406154532, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 17087 + }, + { + "epoch": 0.17088, + "grad_norm": 0.5785630679244134, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 17088 + }, + { + "epoch": 0.17089, + "grad_norm": 0.5387770406885679, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 17089 + }, + { + "epoch": 0.1709, + "grad_norm": 0.5224281719319868, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 17090 + }, + { + "epoch": 0.17091, + "grad_norm": 0.5466749487337251, + "learning_rate": 0.003, + "loss": 4.1053, + "step": 17091 + }, + { + "epoch": 0.17092, + "grad_norm": 0.5773224442912067, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 17092 + }, + { + "epoch": 0.17093, + "grad_norm": 0.6079574601308142, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 17093 + }, + { + "epoch": 0.17094, + "grad_norm": 0.7256985184918995, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 17094 + }, + { + "epoch": 0.17095, + "grad_norm": 0.9023129629485823, + "learning_rate": 0.003, + "loss": 4.055, + "step": 17095 + }, + { + "epoch": 0.17096, + "grad_norm": 1.0036109631300578, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 17096 + }, + { + "epoch": 0.17097, + "grad_norm": 0.9691106360794101, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 17097 + }, + { + "epoch": 0.17098, + "grad_norm": 0.8144532410996149, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 17098 + }, + { + "epoch": 0.17099, + "grad_norm": 0.8244833413472513, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 17099 + }, + { + "epoch": 0.171, + "grad_norm": 0.8515963531151998, + "learning_rate": 0.003, + "loss": 4.07, + "step": 17100 + }, + { + "epoch": 0.17101, + "grad_norm": 0.8307615043135952, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 17101 + }, + { + "epoch": 0.17102, + "grad_norm": 0.8218764246903447, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 17102 + }, + { + "epoch": 0.17103, + "grad_norm": 1.0301794365666588, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 17103 + }, + { + "epoch": 0.17104, + "grad_norm": 1.165111959697851, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 17104 + }, + { + "epoch": 0.17105, + "grad_norm": 0.7810556750976614, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 17105 + }, + { + "epoch": 0.17106, + "grad_norm": 0.6791549396745541, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 17106 + }, + { + "epoch": 0.17107, + "grad_norm": 0.6623671796945579, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 17107 + }, + { + "epoch": 0.17108, + "grad_norm": 0.626084325194252, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 17108 + }, + { + "epoch": 0.17109, + "grad_norm": 0.7191946119738822, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 17109 + }, + { + "epoch": 0.1711, + "grad_norm": 0.7556421999131269, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 17110 + }, + { + "epoch": 0.17111, + "grad_norm": 0.892806033351293, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 17111 + }, + { + "epoch": 0.17112, + "grad_norm": 0.9300008384526975, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 17112 + }, + { + "epoch": 0.17113, + "grad_norm": 0.7855435882492539, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 17113 + }, + { + "epoch": 0.17114, + "grad_norm": 0.5585246965743592, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 17114 + }, + { + "epoch": 0.17115, + "grad_norm": 0.5980668243524988, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 17115 + }, + { + "epoch": 0.17116, + "grad_norm": 0.7086760225422278, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 17116 + }, + { + "epoch": 0.17117, + "grad_norm": 0.729464383261198, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 17117 + }, + { + "epoch": 0.17118, + "grad_norm": 0.7507987290637065, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 17118 + }, + { + "epoch": 0.17119, + "grad_norm": 0.7513906769526895, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 17119 + }, + { + "epoch": 0.1712, + "grad_norm": 0.867350690912068, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 17120 + }, + { + "epoch": 0.17121, + "grad_norm": 0.9372292709027423, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 17121 + }, + { + "epoch": 0.17122, + "grad_norm": 1.0368161169014245, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 17122 + }, + { + "epoch": 0.17123, + "grad_norm": 0.9395479308947648, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 17123 + }, + { + "epoch": 0.17124, + "grad_norm": 0.7663719934384913, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 17124 + }, + { + "epoch": 0.17125, + "grad_norm": 0.8749188981054825, + "learning_rate": 0.003, + "loss": 4.097, + "step": 17125 + }, + { + "epoch": 0.17126, + "grad_norm": 0.8560411118521094, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 17126 + }, + { + "epoch": 0.17127, + "grad_norm": 0.7120271980266843, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 17127 + }, + { + "epoch": 0.17128, + "grad_norm": 0.6213948093478162, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 17128 + }, + { + "epoch": 0.17129, + "grad_norm": 0.66528896986041, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 17129 + }, + { + "epoch": 0.1713, + "grad_norm": 0.6503010118308575, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 17130 + }, + { + "epoch": 0.17131, + "grad_norm": 0.7130942176247402, + "learning_rate": 0.003, + "loss": 4.046, + "step": 17131 + }, + { + "epoch": 0.17132, + "grad_norm": 0.7998220976445287, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 17132 + }, + { + "epoch": 0.17133, + "grad_norm": 1.1540708912237823, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 17133 + }, + { + "epoch": 0.17134, + "grad_norm": 1.1640268003139134, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 17134 + }, + { + "epoch": 0.17135, + "grad_norm": 0.7680178275501996, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 17135 + }, + { + "epoch": 0.17136, + "grad_norm": 0.7458487994792794, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 17136 + }, + { + "epoch": 0.17137, + "grad_norm": 0.743231006541354, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 17137 + }, + { + "epoch": 0.17138, + "grad_norm": 0.8241515608769896, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 17138 + }, + { + "epoch": 0.17139, + "grad_norm": 0.8841436926148614, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 17139 + }, + { + "epoch": 0.1714, + "grad_norm": 0.926111263878185, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 17140 + }, + { + "epoch": 0.17141, + "grad_norm": 1.003759581953196, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 17141 + }, + { + "epoch": 0.17142, + "grad_norm": 1.0681413291701944, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 17142 + }, + { + "epoch": 0.17143, + "grad_norm": 0.8920035370304565, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 17143 + }, + { + "epoch": 0.17144, + "grad_norm": 0.8063588549762267, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 17144 + }, + { + "epoch": 0.17145, + "grad_norm": 0.8201650322026025, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 17145 + }, + { + "epoch": 0.17146, + "grad_norm": 0.8139812085778229, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 17146 + }, + { + "epoch": 0.17147, + "grad_norm": 0.7137289999997846, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 17147 + }, + { + "epoch": 0.17148, + "grad_norm": 0.6362790283551898, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 17148 + }, + { + "epoch": 0.17149, + "grad_norm": 0.7363548722007726, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 17149 + }, + { + "epoch": 0.1715, + "grad_norm": 1.0749319646285522, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 17150 + }, + { + "epoch": 0.17151, + "grad_norm": 1.1873761681442723, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 17151 + }, + { + "epoch": 0.17152, + "grad_norm": 0.6517262037252678, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 17152 + }, + { + "epoch": 0.17153, + "grad_norm": 0.6456980554673047, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 17153 + }, + { + "epoch": 0.17154, + "grad_norm": 0.6401664283315516, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 17154 + }, + { + "epoch": 0.17155, + "grad_norm": 0.6479454585183663, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 17155 + }, + { + "epoch": 0.17156, + "grad_norm": 0.7063221325480135, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 17156 + }, + { + "epoch": 0.17157, + "grad_norm": 0.7449691162562174, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 17157 + }, + { + "epoch": 0.17158, + "grad_norm": 0.7688052210946774, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 17158 + }, + { + "epoch": 0.17159, + "grad_norm": 0.8663240449410309, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 17159 + }, + { + "epoch": 0.1716, + "grad_norm": 0.9350470754519611, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 17160 + }, + { + "epoch": 0.17161, + "grad_norm": 1.105918303174733, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 17161 + }, + { + "epoch": 0.17162, + "grad_norm": 0.9582373552828001, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 17162 + }, + { + "epoch": 0.17163, + "grad_norm": 0.8519330079536576, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 17163 + }, + { + "epoch": 0.17164, + "grad_norm": 0.7843883129538464, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 17164 + }, + { + "epoch": 0.17165, + "grad_norm": 0.8759360611959321, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 17165 + }, + { + "epoch": 0.17166, + "grad_norm": 0.9449295350433374, + "learning_rate": 0.003, + "loss": 4.073, + "step": 17166 + }, + { + "epoch": 0.17167, + "grad_norm": 0.9685718447274639, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 17167 + }, + { + "epoch": 0.17168, + "grad_norm": 0.8829189093930374, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 17168 + }, + { + "epoch": 0.17169, + "grad_norm": 1.0172291499134838, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 17169 + }, + { + "epoch": 0.1717, + "grad_norm": 1.031474876754636, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 17170 + }, + { + "epoch": 0.17171, + "grad_norm": 1.0627549761853843, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 17171 + }, + { + "epoch": 0.17172, + "grad_norm": 1.192244620493509, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 17172 + }, + { + "epoch": 0.17173, + "grad_norm": 0.9696467606689188, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 17173 + }, + { + "epoch": 0.17174, + "grad_norm": 1.0099346721484064, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 17174 + }, + { + "epoch": 0.17175, + "grad_norm": 0.9430570332335579, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 17175 + }, + { + "epoch": 0.17176, + "grad_norm": 0.7416781895159501, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 17176 + }, + { + "epoch": 0.17177, + "grad_norm": 0.5362280596041956, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 17177 + }, + { + "epoch": 0.17178, + "grad_norm": 0.5213637027578487, + "learning_rate": 0.003, + "loss": 4.064, + "step": 17178 + }, + { + "epoch": 0.17179, + "grad_norm": 0.5910661842224878, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 17179 + }, + { + "epoch": 0.1718, + "grad_norm": 0.7203841849058878, + "learning_rate": 0.003, + "loss": 4.076, + "step": 17180 + }, + { + "epoch": 0.17181, + "grad_norm": 0.9174065527544728, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 17181 + }, + { + "epoch": 0.17182, + "grad_norm": 1.1515831223067883, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 17182 + }, + { + "epoch": 0.17183, + "grad_norm": 0.7875882064641271, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 17183 + }, + { + "epoch": 0.17184, + "grad_norm": 0.6113362737273356, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 17184 + }, + { + "epoch": 0.17185, + "grad_norm": 0.6769858413569317, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 17185 + }, + { + "epoch": 0.17186, + "grad_norm": 0.7857685015948951, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 17186 + }, + { + "epoch": 0.17187, + "grad_norm": 0.8887083327904423, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 17187 + }, + { + "epoch": 0.17188, + "grad_norm": 0.9355285903796494, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 17188 + }, + { + "epoch": 0.17189, + "grad_norm": 0.9699610435380733, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 17189 + }, + { + "epoch": 0.1719, + "grad_norm": 0.968892520735324, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 17190 + }, + { + "epoch": 0.17191, + "grad_norm": 0.9278366759031452, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 17191 + }, + { + "epoch": 0.17192, + "grad_norm": 1.0093456528697768, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 17192 + }, + { + "epoch": 0.17193, + "grad_norm": 0.962765750270637, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 17193 + }, + { + "epoch": 0.17194, + "grad_norm": 1.015938938634198, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 17194 + }, + { + "epoch": 0.17195, + "grad_norm": 1.0202078310085068, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 17195 + }, + { + "epoch": 0.17196, + "grad_norm": 0.7320971398547693, + "learning_rate": 0.003, + "loss": 4.037, + "step": 17196 + }, + { + "epoch": 0.17197, + "grad_norm": 0.6858718177060639, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 17197 + }, + { + "epoch": 0.17198, + "grad_norm": 0.8182467796436648, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 17198 + }, + { + "epoch": 0.17199, + "grad_norm": 0.8029616233649419, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 17199 + }, + { + "epoch": 0.172, + "grad_norm": 0.8529230097017862, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 17200 + }, + { + "epoch": 0.17201, + "grad_norm": 0.8384852167472535, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 17201 + }, + { + "epoch": 0.17202, + "grad_norm": 0.8417210218463609, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 17202 + }, + { + "epoch": 0.17203, + "grad_norm": 1.0840588938958944, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 17203 + }, + { + "epoch": 0.17204, + "grad_norm": 1.062999691063719, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 17204 + }, + { + "epoch": 0.17205, + "grad_norm": 0.8714578232781969, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 17205 + }, + { + "epoch": 0.17206, + "grad_norm": 0.8553273735258046, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 17206 + }, + { + "epoch": 0.17207, + "grad_norm": 0.8002136610055574, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 17207 + }, + { + "epoch": 0.17208, + "grad_norm": 0.7176815025210549, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 17208 + }, + { + "epoch": 0.17209, + "grad_norm": 0.6207109948968895, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 17209 + }, + { + "epoch": 0.1721, + "grad_norm": 0.6247885292327184, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 17210 + }, + { + "epoch": 0.17211, + "grad_norm": 0.6722672956311609, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 17211 + }, + { + "epoch": 0.17212, + "grad_norm": 0.6949125408976792, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 17212 + }, + { + "epoch": 0.17213, + "grad_norm": 0.8528555681479099, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 17213 + }, + { + "epoch": 0.17214, + "grad_norm": 0.9799468799478441, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 17214 + }, + { + "epoch": 0.17215, + "grad_norm": 1.2014577690733408, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 17215 + }, + { + "epoch": 0.17216, + "grad_norm": 1.0459659173443419, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 17216 + }, + { + "epoch": 0.17217, + "grad_norm": 1.0545798776296638, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 17217 + }, + { + "epoch": 0.17218, + "grad_norm": 1.009583047859804, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 17218 + }, + { + "epoch": 0.17219, + "grad_norm": 1.0471807416764718, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 17219 + }, + { + "epoch": 0.1722, + "grad_norm": 0.8585335653507692, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 17220 + }, + { + "epoch": 0.17221, + "grad_norm": 0.848650943483469, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 17221 + }, + { + "epoch": 0.17222, + "grad_norm": 0.9650349686441796, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 17222 + }, + { + "epoch": 0.17223, + "grad_norm": 0.985307049479663, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 17223 + }, + { + "epoch": 0.17224, + "grad_norm": 0.9812775977592947, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 17224 + }, + { + "epoch": 0.17225, + "grad_norm": 0.9237673151249101, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 17225 + }, + { + "epoch": 0.17226, + "grad_norm": 0.8417361738016759, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 17226 + }, + { + "epoch": 0.17227, + "grad_norm": 0.8717303731859316, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 17227 + }, + { + "epoch": 0.17228, + "grad_norm": 0.8886103465575612, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 17228 + }, + { + "epoch": 0.17229, + "grad_norm": 0.8754470496635101, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 17229 + }, + { + "epoch": 0.1723, + "grad_norm": 0.8245545788155056, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 17230 + }, + { + "epoch": 0.17231, + "grad_norm": 0.7937219643903761, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 17231 + }, + { + "epoch": 0.17232, + "grad_norm": 0.7994651748103063, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 17232 + }, + { + "epoch": 0.17233, + "grad_norm": 0.9901505651795912, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 17233 + }, + { + "epoch": 0.17234, + "grad_norm": 1.1057812119967034, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 17234 + }, + { + "epoch": 0.17235, + "grad_norm": 1.0501265727346045, + "learning_rate": 0.003, + "loss": 4.044, + "step": 17235 + }, + { + "epoch": 0.17236, + "grad_norm": 0.9980172470686348, + "learning_rate": 0.003, + "loss": 4.103, + "step": 17236 + }, + { + "epoch": 0.17237, + "grad_norm": 1.055511422736484, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 17237 + }, + { + "epoch": 0.17238, + "grad_norm": 1.0280707586992661, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 17238 + }, + { + "epoch": 0.17239, + "grad_norm": 0.9969897409512702, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 17239 + }, + { + "epoch": 0.1724, + "grad_norm": 0.8827205706856681, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 17240 + }, + { + "epoch": 0.17241, + "grad_norm": 0.8572914366402444, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 17241 + }, + { + "epoch": 0.17242, + "grad_norm": 0.9145285684578551, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 17242 + }, + { + "epoch": 0.17243, + "grad_norm": 0.9361469788165563, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 17243 + }, + { + "epoch": 0.17244, + "grad_norm": 1.0484950899124539, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 17244 + }, + { + "epoch": 0.17245, + "grad_norm": 0.9460009857292923, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 17245 + }, + { + "epoch": 0.17246, + "grad_norm": 0.8691325082971929, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 17246 + }, + { + "epoch": 0.17247, + "grad_norm": 0.833335179981703, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 17247 + }, + { + "epoch": 0.17248, + "grad_norm": 0.7813119985619414, + "learning_rate": 0.003, + "loss": 4.083, + "step": 17248 + }, + { + "epoch": 0.17249, + "grad_norm": 0.7580442016495044, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 17249 + }, + { + "epoch": 0.1725, + "grad_norm": 0.8757261798240211, + "learning_rate": 0.003, + "loss": 4.087, + "step": 17250 + }, + { + "epoch": 0.17251, + "grad_norm": 0.8324547581132363, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 17251 + }, + { + "epoch": 0.17252, + "grad_norm": 0.7270042248527319, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 17252 + }, + { + "epoch": 0.17253, + "grad_norm": 0.7199955731550661, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 17253 + }, + { + "epoch": 0.17254, + "grad_norm": 0.7646752198671325, + "learning_rate": 0.003, + "loss": 4.027, + "step": 17254 + }, + { + "epoch": 0.17255, + "grad_norm": 0.8604880952711906, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 17255 + }, + { + "epoch": 0.17256, + "grad_norm": 0.9151988992040045, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 17256 + }, + { + "epoch": 0.17257, + "grad_norm": 0.8548837132435793, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 17257 + }, + { + "epoch": 0.17258, + "grad_norm": 0.6261999558389855, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 17258 + }, + { + "epoch": 0.17259, + "grad_norm": 0.5410557986097894, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 17259 + }, + { + "epoch": 0.1726, + "grad_norm": 0.624676544911726, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 17260 + }, + { + "epoch": 0.17261, + "grad_norm": 0.7859287744538213, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 17261 + }, + { + "epoch": 0.17262, + "grad_norm": 0.8624456349182722, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 17262 + }, + { + "epoch": 0.17263, + "grad_norm": 0.8731087489875774, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 17263 + }, + { + "epoch": 0.17264, + "grad_norm": 0.8091508299507951, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 17264 + }, + { + "epoch": 0.17265, + "grad_norm": 0.7110748362336592, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 17265 + }, + { + "epoch": 0.17266, + "grad_norm": 0.7232852377630583, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 17266 + }, + { + "epoch": 0.17267, + "grad_norm": 0.8192577509209505, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 17267 + }, + { + "epoch": 0.17268, + "grad_norm": 0.8469306064141809, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 17268 + }, + { + "epoch": 0.17269, + "grad_norm": 0.8655466936923366, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 17269 + }, + { + "epoch": 0.1727, + "grad_norm": 0.9277575011427177, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 17270 + }, + { + "epoch": 0.17271, + "grad_norm": 1.1779658679488627, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 17271 + }, + { + "epoch": 0.17272, + "grad_norm": 0.9689316268022872, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 17272 + }, + { + "epoch": 0.17273, + "grad_norm": 0.8243911467881477, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 17273 + }, + { + "epoch": 0.17274, + "grad_norm": 0.7626622528624412, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 17274 + }, + { + "epoch": 0.17275, + "grad_norm": 0.7991502451912672, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 17275 + }, + { + "epoch": 0.17276, + "grad_norm": 0.7410218460367459, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 17276 + }, + { + "epoch": 0.17277, + "grad_norm": 0.7888996169821576, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 17277 + }, + { + "epoch": 0.17278, + "grad_norm": 0.8333273603853033, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 17278 + }, + { + "epoch": 0.17279, + "grad_norm": 0.8617489204975749, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 17279 + }, + { + "epoch": 0.1728, + "grad_norm": 0.9461896957998942, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 17280 + }, + { + "epoch": 0.17281, + "grad_norm": 0.9875240275964974, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 17281 + }, + { + "epoch": 0.17282, + "grad_norm": 1.0286590722274698, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 17282 + }, + { + "epoch": 0.17283, + "grad_norm": 1.0599897332944377, + "learning_rate": 0.003, + "loss": 4.072, + "step": 17283 + }, + { + "epoch": 0.17284, + "grad_norm": 1.0120046528501239, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 17284 + }, + { + "epoch": 0.17285, + "grad_norm": 0.8778888577381524, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 17285 + }, + { + "epoch": 0.17286, + "grad_norm": 0.7973275547969494, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 17286 + }, + { + "epoch": 0.17287, + "grad_norm": 0.8452580436387402, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 17287 + }, + { + "epoch": 0.17288, + "grad_norm": 0.8505597531306186, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 17288 + }, + { + "epoch": 0.17289, + "grad_norm": 0.8317333037306122, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 17289 + }, + { + "epoch": 0.1729, + "grad_norm": 1.008993386006391, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 17290 + }, + { + "epoch": 0.17291, + "grad_norm": 1.114491086597863, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 17291 + }, + { + "epoch": 0.17292, + "grad_norm": 0.9808995520484136, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 17292 + }, + { + "epoch": 0.17293, + "grad_norm": 1.0501849633682052, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 17293 + }, + { + "epoch": 0.17294, + "grad_norm": 0.9247467345092111, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 17294 + }, + { + "epoch": 0.17295, + "grad_norm": 0.8879352488478393, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 17295 + }, + { + "epoch": 0.17296, + "grad_norm": 0.8544677992278479, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 17296 + }, + { + "epoch": 0.17297, + "grad_norm": 0.9801565895747134, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 17297 + }, + { + "epoch": 0.17298, + "grad_norm": 1.3174293108271267, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 17298 + }, + { + "epoch": 0.17299, + "grad_norm": 0.8574921353754716, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 17299 + }, + { + "epoch": 0.173, + "grad_norm": 0.9528195091626303, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 17300 + }, + { + "epoch": 0.17301, + "grad_norm": 0.8429109507322878, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 17301 + }, + { + "epoch": 0.17302, + "grad_norm": 0.7462237690461228, + "learning_rate": 0.003, + "loss": 4.067, + "step": 17302 + }, + { + "epoch": 0.17303, + "grad_norm": 0.795428100025416, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 17303 + }, + { + "epoch": 0.17304, + "grad_norm": 0.75221237706329, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 17304 + }, + { + "epoch": 0.17305, + "grad_norm": 0.8322306659265789, + "learning_rate": 0.003, + "loss": 4.053, + "step": 17305 + }, + { + "epoch": 0.17306, + "grad_norm": 0.8282803772155906, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 17306 + }, + { + "epoch": 0.17307, + "grad_norm": 0.8147818327391885, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 17307 + }, + { + "epoch": 0.17308, + "grad_norm": 0.6959409443735983, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 17308 + }, + { + "epoch": 0.17309, + "grad_norm": 0.8336328467593661, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 17309 + }, + { + "epoch": 0.1731, + "grad_norm": 1.216790763242895, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 17310 + }, + { + "epoch": 0.17311, + "grad_norm": 1.1392406129899704, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 17311 + }, + { + "epoch": 0.17312, + "grad_norm": 0.7787514008594593, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 17312 + }, + { + "epoch": 0.17313, + "grad_norm": 0.6446576291053752, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 17313 + }, + { + "epoch": 0.17314, + "grad_norm": 0.7132887345769777, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 17314 + }, + { + "epoch": 0.17315, + "grad_norm": 0.6951002272597862, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 17315 + }, + { + "epoch": 0.17316, + "grad_norm": 0.7157765569698302, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 17316 + }, + { + "epoch": 0.17317, + "grad_norm": 0.8041583097445666, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 17317 + }, + { + "epoch": 0.17318, + "grad_norm": 1.0373881978167832, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 17318 + }, + { + "epoch": 0.17319, + "grad_norm": 1.0578100222463147, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 17319 + }, + { + "epoch": 0.1732, + "grad_norm": 0.8543196037379642, + "learning_rate": 0.003, + "loss": 4.047, + "step": 17320 + }, + { + "epoch": 0.17321, + "grad_norm": 0.8089396912332841, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 17321 + }, + { + "epoch": 0.17322, + "grad_norm": 0.7506929583233183, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 17322 + }, + { + "epoch": 0.17323, + "grad_norm": 0.7580967782892147, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 17323 + }, + { + "epoch": 0.17324, + "grad_norm": 0.7019955689096509, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 17324 + }, + { + "epoch": 0.17325, + "grad_norm": 0.8004041472192007, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 17325 + }, + { + "epoch": 0.17326, + "grad_norm": 0.906514365447904, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 17326 + }, + { + "epoch": 0.17327, + "grad_norm": 1.0761067587509687, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 17327 + }, + { + "epoch": 0.17328, + "grad_norm": 0.9990357087343317, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 17328 + }, + { + "epoch": 0.17329, + "grad_norm": 0.9859818289448217, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 17329 + }, + { + "epoch": 0.1733, + "grad_norm": 1.0741354235531915, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 17330 + }, + { + "epoch": 0.17331, + "grad_norm": 0.8676919064567706, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 17331 + }, + { + "epoch": 0.17332, + "grad_norm": 0.7908551989976313, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 17332 + }, + { + "epoch": 0.17333, + "grad_norm": 0.7377602197492152, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 17333 + }, + { + "epoch": 0.17334, + "grad_norm": 0.8150042288786687, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 17334 + }, + { + "epoch": 0.17335, + "grad_norm": 0.6685684565585098, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 17335 + }, + { + "epoch": 0.17336, + "grad_norm": 0.6184537779101107, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 17336 + }, + { + "epoch": 0.17337, + "grad_norm": 0.6336402118718607, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 17337 + }, + { + "epoch": 0.17338, + "grad_norm": 0.7629785540495987, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 17338 + }, + { + "epoch": 0.17339, + "grad_norm": 0.7654657701343637, + "learning_rate": 0.003, + "loss": 4.075, + "step": 17339 + }, + { + "epoch": 0.1734, + "grad_norm": 0.7814302560778268, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 17340 + }, + { + "epoch": 0.17341, + "grad_norm": 1.0116673944445607, + "learning_rate": 0.003, + "loss": 4.04, + "step": 17341 + }, + { + "epoch": 0.17342, + "grad_norm": 1.3156796022672015, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 17342 + }, + { + "epoch": 0.17343, + "grad_norm": 0.6835139520659282, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 17343 + }, + { + "epoch": 0.17344, + "grad_norm": 0.653366109440773, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 17344 + }, + { + "epoch": 0.17345, + "grad_norm": 0.7199124076803893, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 17345 + }, + { + "epoch": 0.17346, + "grad_norm": 0.7325160021916057, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 17346 + }, + { + "epoch": 0.17347, + "grad_norm": 0.758612119806983, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 17347 + }, + { + "epoch": 0.17348, + "grad_norm": 0.7589921271569143, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 17348 + }, + { + "epoch": 0.17349, + "grad_norm": 0.7258675784146073, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 17349 + }, + { + "epoch": 0.1735, + "grad_norm": 0.7771286353223587, + "learning_rate": 0.003, + "loss": 4.044, + "step": 17350 + }, + { + "epoch": 0.17351, + "grad_norm": 0.7997971082043684, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 17351 + }, + { + "epoch": 0.17352, + "grad_norm": 0.9160256675432547, + "learning_rate": 0.003, + "loss": 4.056, + "step": 17352 + }, + { + "epoch": 0.17353, + "grad_norm": 0.9831985554775784, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 17353 + }, + { + "epoch": 0.17354, + "grad_norm": 0.920199886221615, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 17354 + }, + { + "epoch": 0.17355, + "grad_norm": 0.9193153173406821, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 17355 + }, + { + "epoch": 0.17356, + "grad_norm": 0.9952068634206899, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 17356 + }, + { + "epoch": 0.17357, + "grad_norm": 1.0809133928846821, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 17357 + }, + { + "epoch": 0.17358, + "grad_norm": 0.9751393261045358, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 17358 + }, + { + "epoch": 0.17359, + "grad_norm": 1.0974371466481527, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 17359 + }, + { + "epoch": 0.1736, + "grad_norm": 1.2407129187930825, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 17360 + }, + { + "epoch": 0.17361, + "grad_norm": 0.912345854382728, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 17361 + }, + { + "epoch": 0.17362, + "grad_norm": 1.0023088642121027, + "learning_rate": 0.003, + "loss": 4.073, + "step": 17362 + }, + { + "epoch": 0.17363, + "grad_norm": 1.1697749023106154, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 17363 + }, + { + "epoch": 0.17364, + "grad_norm": 0.852100101056053, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 17364 + }, + { + "epoch": 0.17365, + "grad_norm": 0.8037060442021475, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 17365 + }, + { + "epoch": 0.17366, + "grad_norm": 0.9146820611097617, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 17366 + }, + { + "epoch": 0.17367, + "grad_norm": 0.9773714867247653, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 17367 + }, + { + "epoch": 0.17368, + "grad_norm": 0.9700324896984772, + "learning_rate": 0.003, + "loss": 4.043, + "step": 17368 + }, + { + "epoch": 0.17369, + "grad_norm": 0.9282414802865684, + "learning_rate": 0.003, + "loss": 4.071, + "step": 17369 + }, + { + "epoch": 0.1737, + "grad_norm": 0.9687659983616759, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 17370 + }, + { + "epoch": 0.17371, + "grad_norm": 0.9981063118178767, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 17371 + }, + { + "epoch": 0.17372, + "grad_norm": 1.1171285873302719, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 17372 + }, + { + "epoch": 0.17373, + "grad_norm": 1.1013248404196918, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 17373 + }, + { + "epoch": 0.17374, + "grad_norm": 1.046819789427008, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 17374 + }, + { + "epoch": 0.17375, + "grad_norm": 0.9488223410056068, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 17375 + }, + { + "epoch": 0.17376, + "grad_norm": 0.8939021914190054, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 17376 + }, + { + "epoch": 0.17377, + "grad_norm": 0.8071698421728923, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 17377 + }, + { + "epoch": 0.17378, + "grad_norm": 0.7831473096804777, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 17378 + }, + { + "epoch": 0.17379, + "grad_norm": 0.7593526077225092, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 17379 + }, + { + "epoch": 0.1738, + "grad_norm": 0.780580847798873, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 17380 + }, + { + "epoch": 0.17381, + "grad_norm": 0.8454248660344981, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 17381 + }, + { + "epoch": 0.17382, + "grad_norm": 0.9889286500472355, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 17382 + }, + { + "epoch": 0.17383, + "grad_norm": 0.9074023780782959, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 17383 + }, + { + "epoch": 0.17384, + "grad_norm": 0.8810076460238118, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 17384 + }, + { + "epoch": 0.17385, + "grad_norm": 0.8953587302784513, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 17385 + }, + { + "epoch": 0.17386, + "grad_norm": 0.9250273848925091, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 17386 + }, + { + "epoch": 0.17387, + "grad_norm": 0.8891341176753061, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 17387 + }, + { + "epoch": 0.17388, + "grad_norm": 0.7799053717768336, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 17388 + }, + { + "epoch": 0.17389, + "grad_norm": 0.7298546868214523, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 17389 + }, + { + "epoch": 0.1739, + "grad_norm": 0.7766456205453687, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 17390 + }, + { + "epoch": 0.17391, + "grad_norm": 0.7500950132298999, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 17391 + }, + { + "epoch": 0.17392, + "grad_norm": 0.7613533634802031, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 17392 + }, + { + "epoch": 0.17393, + "grad_norm": 0.8939331275223772, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 17393 + }, + { + "epoch": 0.17394, + "grad_norm": 0.9631422829173715, + "learning_rate": 0.003, + "loss": 4.078, + "step": 17394 + }, + { + "epoch": 0.17395, + "grad_norm": 1.0997327445199163, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 17395 + }, + { + "epoch": 0.17396, + "grad_norm": 0.9850191451235975, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 17396 + }, + { + "epoch": 0.17397, + "grad_norm": 0.9921984540290422, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 17397 + }, + { + "epoch": 0.17398, + "grad_norm": 0.9958769014421154, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 17398 + }, + { + "epoch": 0.17399, + "grad_norm": 1.0535711570251973, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 17399 + }, + { + "epoch": 0.174, + "grad_norm": 0.8891907893217653, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 17400 + }, + { + "epoch": 0.17401, + "grad_norm": 0.915005863358091, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 17401 + }, + { + "epoch": 0.17402, + "grad_norm": 1.0196831636719055, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 17402 + }, + { + "epoch": 0.17403, + "grad_norm": 1.1226770599304043, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 17403 + }, + { + "epoch": 0.17404, + "grad_norm": 0.9806517081879206, + "learning_rate": 0.003, + "loss": 4.061, + "step": 17404 + }, + { + "epoch": 0.17405, + "grad_norm": 1.0671346149211505, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 17405 + }, + { + "epoch": 0.17406, + "grad_norm": 0.8848281560108651, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 17406 + }, + { + "epoch": 0.17407, + "grad_norm": 0.8453865737751833, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 17407 + }, + { + "epoch": 0.17408, + "grad_norm": 0.7968385644112044, + "learning_rate": 0.003, + "loss": 4.085, + "step": 17408 + }, + { + "epoch": 0.17409, + "grad_norm": 0.6942131001547551, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 17409 + }, + { + "epoch": 0.1741, + "grad_norm": 0.7163708449335096, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 17410 + }, + { + "epoch": 0.17411, + "grad_norm": 0.6716894864268983, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 17411 + }, + { + "epoch": 0.17412, + "grad_norm": 0.775760487226005, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 17412 + }, + { + "epoch": 0.17413, + "grad_norm": 0.8662987353372363, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 17413 + }, + { + "epoch": 0.17414, + "grad_norm": 0.9356244272211784, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 17414 + }, + { + "epoch": 0.17415, + "grad_norm": 0.8988221550088145, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 17415 + }, + { + "epoch": 0.17416, + "grad_norm": 0.7942533406696778, + "learning_rate": 0.003, + "loss": 4.063, + "step": 17416 + }, + { + "epoch": 0.17417, + "grad_norm": 0.7416019879203941, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 17417 + }, + { + "epoch": 0.17418, + "grad_norm": 0.7069953785734495, + "learning_rate": 0.003, + "loss": 4.045, + "step": 17418 + }, + { + "epoch": 0.17419, + "grad_norm": 0.7428780027332129, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 17419 + }, + { + "epoch": 0.1742, + "grad_norm": 0.8926578190396952, + "learning_rate": 0.003, + "loss": 4.033, + "step": 17420 + }, + { + "epoch": 0.17421, + "grad_norm": 1.0843610623028588, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 17421 + }, + { + "epoch": 0.17422, + "grad_norm": 1.038564518601075, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 17422 + }, + { + "epoch": 0.17423, + "grad_norm": 0.8328262876514982, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 17423 + }, + { + "epoch": 0.17424, + "grad_norm": 0.6821089088106999, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 17424 + }, + { + "epoch": 0.17425, + "grad_norm": 0.6826071597439007, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 17425 + }, + { + "epoch": 0.17426, + "grad_norm": 0.695004814553453, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 17426 + }, + { + "epoch": 0.17427, + "grad_norm": 0.6393247156055374, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 17427 + }, + { + "epoch": 0.17428, + "grad_norm": 0.637871628292266, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 17428 + }, + { + "epoch": 0.17429, + "grad_norm": 0.7777178047163732, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 17429 + }, + { + "epoch": 0.1743, + "grad_norm": 0.8661779560341816, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 17430 + }, + { + "epoch": 0.17431, + "grad_norm": 0.9412312669248412, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 17431 + }, + { + "epoch": 0.17432, + "grad_norm": 1.1525591841199643, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 17432 + }, + { + "epoch": 0.17433, + "grad_norm": 0.9383373213894208, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 17433 + }, + { + "epoch": 0.17434, + "grad_norm": 0.9936019420397292, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 17434 + }, + { + "epoch": 0.17435, + "grad_norm": 0.9421004121289757, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 17435 + }, + { + "epoch": 0.17436, + "grad_norm": 0.8402850560318735, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 17436 + }, + { + "epoch": 0.17437, + "grad_norm": 0.8675813238878146, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 17437 + }, + { + "epoch": 0.17438, + "grad_norm": 0.7833673318869098, + "learning_rate": 0.003, + "loss": 4.071, + "step": 17438 + }, + { + "epoch": 0.17439, + "grad_norm": 0.6890685150324481, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 17439 + }, + { + "epoch": 0.1744, + "grad_norm": 0.8281242550417617, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 17440 + }, + { + "epoch": 0.17441, + "grad_norm": 0.9063872311122002, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 17441 + }, + { + "epoch": 0.17442, + "grad_norm": 1.2388567936666337, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 17442 + }, + { + "epoch": 0.17443, + "grad_norm": 1.0132032163351703, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 17443 + }, + { + "epoch": 0.17444, + "grad_norm": 1.225345906624636, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 17444 + }, + { + "epoch": 0.17445, + "grad_norm": 0.8511032109496721, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 17445 + }, + { + "epoch": 0.17446, + "grad_norm": 0.7447706794121896, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 17446 + }, + { + "epoch": 0.17447, + "grad_norm": 0.7090157341706049, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 17447 + }, + { + "epoch": 0.17448, + "grad_norm": 0.7505360344565417, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 17448 + }, + { + "epoch": 0.17449, + "grad_norm": 0.848405949917275, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 17449 + }, + { + "epoch": 0.1745, + "grad_norm": 0.8858009254230506, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 17450 + }, + { + "epoch": 0.17451, + "grad_norm": 0.8937772087997056, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 17451 + }, + { + "epoch": 0.17452, + "grad_norm": 0.9326565300713741, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 17452 + }, + { + "epoch": 0.17453, + "grad_norm": 1.0995121363945681, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 17453 + }, + { + "epoch": 0.17454, + "grad_norm": 0.9659465972815166, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 17454 + }, + { + "epoch": 0.17455, + "grad_norm": 1.0733968597019738, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 17455 + }, + { + "epoch": 0.17456, + "grad_norm": 0.9250485837570622, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 17456 + }, + { + "epoch": 0.17457, + "grad_norm": 0.8429212214718267, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 17457 + }, + { + "epoch": 0.17458, + "grad_norm": 0.8508206086301354, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 17458 + }, + { + "epoch": 0.17459, + "grad_norm": 0.8916709654437155, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 17459 + }, + { + "epoch": 0.1746, + "grad_norm": 1.1477856385306506, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 17460 + }, + { + "epoch": 0.17461, + "grad_norm": 1.155867703290962, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 17461 + }, + { + "epoch": 0.17462, + "grad_norm": 1.0692742880481143, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 17462 + }, + { + "epoch": 0.17463, + "grad_norm": 0.990337648032987, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 17463 + }, + { + "epoch": 0.17464, + "grad_norm": 0.8268968118405826, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 17464 + }, + { + "epoch": 0.17465, + "grad_norm": 0.7154827931963107, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 17465 + }, + { + "epoch": 0.17466, + "grad_norm": 0.5525510500888201, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 17466 + }, + { + "epoch": 0.17467, + "grad_norm": 0.4735389124705273, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 17467 + }, + { + "epoch": 0.17468, + "grad_norm": 0.49886317222344084, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 17468 + }, + { + "epoch": 0.17469, + "grad_norm": 0.5751635571493581, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 17469 + }, + { + "epoch": 0.1747, + "grad_norm": 0.7130241338777848, + "learning_rate": 0.003, + "loss": 4.077, + "step": 17470 + }, + { + "epoch": 0.17471, + "grad_norm": 0.9869723748518184, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 17471 + }, + { + "epoch": 0.17472, + "grad_norm": 1.2346072474558896, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 17472 + }, + { + "epoch": 0.17473, + "grad_norm": 0.771892335133957, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 17473 + }, + { + "epoch": 0.17474, + "grad_norm": 0.624937231162835, + "learning_rate": 0.003, + "loss": 4.051, + "step": 17474 + }, + { + "epoch": 0.17475, + "grad_norm": 0.628631224984318, + "learning_rate": 0.003, + "loss": 4.038, + "step": 17475 + }, + { + "epoch": 0.17476, + "grad_norm": 0.7573342877762853, + "learning_rate": 0.003, + "loss": 4.066, + "step": 17476 + }, + { + "epoch": 0.17477, + "grad_norm": 0.8432150934639794, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 17477 + }, + { + "epoch": 0.17478, + "grad_norm": 0.8697572738867382, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 17478 + }, + { + "epoch": 0.17479, + "grad_norm": 0.7884222181806019, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 17479 + }, + { + "epoch": 0.1748, + "grad_norm": 0.7342124982656747, + "learning_rate": 0.003, + "loss": 4.06, + "step": 17480 + }, + { + "epoch": 0.17481, + "grad_norm": 0.7582116739894746, + "learning_rate": 0.003, + "loss": 4.039, + "step": 17481 + }, + { + "epoch": 0.17482, + "grad_norm": 0.9322740002099004, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 17482 + }, + { + "epoch": 0.17483, + "grad_norm": 1.1975977579067585, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 17483 + }, + { + "epoch": 0.17484, + "grad_norm": 0.9279283498512066, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 17484 + }, + { + "epoch": 0.17485, + "grad_norm": 0.9773264109216454, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 17485 + }, + { + "epoch": 0.17486, + "grad_norm": 1.1255628443882768, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 17486 + }, + { + "epoch": 0.17487, + "grad_norm": 1.0057673704881918, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 17487 + }, + { + "epoch": 0.17488, + "grad_norm": 0.944213008071853, + "learning_rate": 0.003, + "loss": 4.057, + "step": 17488 + }, + { + "epoch": 0.17489, + "grad_norm": 0.9705320820215475, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 17489 + }, + { + "epoch": 0.1749, + "grad_norm": 1.0360271483695411, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 17490 + }, + { + "epoch": 0.17491, + "grad_norm": 1.1121761504399486, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 17491 + }, + { + "epoch": 0.17492, + "grad_norm": 0.9461535202916413, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 17492 + }, + { + "epoch": 0.17493, + "grad_norm": 0.9638805346180465, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 17493 + }, + { + "epoch": 0.17494, + "grad_norm": 0.9806457833729898, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 17494 + }, + { + "epoch": 0.17495, + "grad_norm": 1.0116307502999224, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 17495 + }, + { + "epoch": 0.17496, + "grad_norm": 1.0115411892178496, + "learning_rate": 0.003, + "loss": 4.053, + "step": 17496 + }, + { + "epoch": 0.17497, + "grad_norm": 1.1104635419976652, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 17497 + }, + { + "epoch": 0.17498, + "grad_norm": 0.9160038775468329, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 17498 + }, + { + "epoch": 0.17499, + "grad_norm": 0.8858753450072356, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 17499 + }, + { + "epoch": 0.175, + "grad_norm": 0.8965716533669236, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 17500 + }, + { + "epoch": 0.17501, + "grad_norm": 0.9137417740197247, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 17501 + }, + { + "epoch": 0.17502, + "grad_norm": 0.8679411526719324, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 17502 + }, + { + "epoch": 0.17503, + "grad_norm": 0.7832916001414321, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 17503 + }, + { + "epoch": 0.17504, + "grad_norm": 0.8647366432455418, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 17504 + }, + { + "epoch": 0.17505, + "grad_norm": 0.924796909872522, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 17505 + }, + { + "epoch": 0.17506, + "grad_norm": 1.0500999561969173, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 17506 + }, + { + "epoch": 0.17507, + "grad_norm": 0.9968136172953102, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 17507 + }, + { + "epoch": 0.17508, + "grad_norm": 1.0441344830453017, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 17508 + }, + { + "epoch": 0.17509, + "grad_norm": 0.9907929715900188, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 17509 + }, + { + "epoch": 0.1751, + "grad_norm": 1.0625363858834336, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 17510 + }, + { + "epoch": 0.17511, + "grad_norm": 0.9310743830942303, + "learning_rate": 0.003, + "loss": 4.061, + "step": 17511 + }, + { + "epoch": 0.17512, + "grad_norm": 0.9430515190103738, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 17512 + }, + { + "epoch": 0.17513, + "grad_norm": 0.946283325212689, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 17513 + }, + { + "epoch": 0.17514, + "grad_norm": 0.9539086214780715, + "learning_rate": 0.003, + "loss": 4.085, + "step": 17514 + }, + { + "epoch": 0.17515, + "grad_norm": 0.9192559681004006, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 17515 + }, + { + "epoch": 0.17516, + "grad_norm": 0.9613466243063407, + "learning_rate": 0.003, + "loss": 4.067, + "step": 17516 + }, + { + "epoch": 0.17517, + "grad_norm": 0.855057663188413, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 17517 + }, + { + "epoch": 0.17518, + "grad_norm": 0.7819737238790523, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 17518 + }, + { + "epoch": 0.17519, + "grad_norm": 0.7894929215269617, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 17519 + }, + { + "epoch": 0.1752, + "grad_norm": 0.7785174026098362, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 17520 + }, + { + "epoch": 0.17521, + "grad_norm": 0.6451900631314694, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 17521 + }, + { + "epoch": 0.17522, + "grad_norm": 0.7348322208828698, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 17522 + }, + { + "epoch": 0.17523, + "grad_norm": 0.787242442960829, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 17523 + }, + { + "epoch": 0.17524, + "grad_norm": 0.8435581819140278, + "learning_rate": 0.003, + "loss": 4.063, + "step": 17524 + }, + { + "epoch": 0.17525, + "grad_norm": 1.0518817907549147, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 17525 + }, + { + "epoch": 0.17526, + "grad_norm": 1.072043433733588, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 17526 + }, + { + "epoch": 0.17527, + "grad_norm": 1.0008141177785768, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 17527 + }, + { + "epoch": 0.17528, + "grad_norm": 0.8761648169689856, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 17528 + }, + { + "epoch": 0.17529, + "grad_norm": 0.808953566953239, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 17529 + }, + { + "epoch": 0.1753, + "grad_norm": 1.0491460634838619, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 17530 + }, + { + "epoch": 0.17531, + "grad_norm": 1.1254342282308454, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 17531 + }, + { + "epoch": 0.17532, + "grad_norm": 0.9106072703230489, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 17532 + }, + { + "epoch": 0.17533, + "grad_norm": 0.8459672349588316, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 17533 + }, + { + "epoch": 0.17534, + "grad_norm": 0.7859129329620995, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 17534 + }, + { + "epoch": 0.17535, + "grad_norm": 0.7164187780365021, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 17535 + }, + { + "epoch": 0.17536, + "grad_norm": 0.6758578745030279, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 17536 + }, + { + "epoch": 0.17537, + "grad_norm": 0.6301918071033128, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 17537 + }, + { + "epoch": 0.17538, + "grad_norm": 0.6946016540411071, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 17538 + }, + { + "epoch": 0.17539, + "grad_norm": 0.9137357060371742, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 17539 + }, + { + "epoch": 0.1754, + "grad_norm": 1.2438535308097833, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 17540 + }, + { + "epoch": 0.17541, + "grad_norm": 0.8563545188060449, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 17541 + }, + { + "epoch": 0.17542, + "grad_norm": 0.585566432087187, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 17542 + }, + { + "epoch": 0.17543, + "grad_norm": 0.6158963241099712, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 17543 + }, + { + "epoch": 0.17544, + "grad_norm": 0.7051359847712503, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 17544 + }, + { + "epoch": 0.17545, + "grad_norm": 0.7906001545104807, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 17545 + }, + { + "epoch": 0.17546, + "grad_norm": 0.8228927302192418, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 17546 + }, + { + "epoch": 0.17547, + "grad_norm": 0.7972687877468351, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 17547 + }, + { + "epoch": 0.17548, + "grad_norm": 0.7823780064342654, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 17548 + }, + { + "epoch": 0.17549, + "grad_norm": 0.7672987644721564, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 17549 + }, + { + "epoch": 0.1755, + "grad_norm": 0.8392083698199194, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 17550 + }, + { + "epoch": 0.17551, + "grad_norm": 0.7915752478764481, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 17551 + }, + { + "epoch": 0.17552, + "grad_norm": 0.7277287945535047, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 17552 + }, + { + "epoch": 0.17553, + "grad_norm": 0.6824068623893177, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 17553 + }, + { + "epoch": 0.17554, + "grad_norm": 0.6826232765114713, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 17554 + }, + { + "epoch": 0.17555, + "grad_norm": 0.7615756928790316, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 17555 + }, + { + "epoch": 0.17556, + "grad_norm": 0.8903101580904664, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 17556 + }, + { + "epoch": 0.17557, + "grad_norm": 1.259822393119585, + "learning_rate": 0.003, + "loss": 4.055, + "step": 17557 + }, + { + "epoch": 0.17558, + "grad_norm": 0.8839182258854181, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 17558 + }, + { + "epoch": 0.17559, + "grad_norm": 0.923864898305606, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 17559 + }, + { + "epoch": 0.1756, + "grad_norm": 0.9402325745136028, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 17560 + }, + { + "epoch": 0.17561, + "grad_norm": 1.1413235289170909, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 17561 + }, + { + "epoch": 0.17562, + "grad_norm": 1.075106099535397, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 17562 + }, + { + "epoch": 0.17563, + "grad_norm": 0.9673705072801845, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 17563 + }, + { + "epoch": 0.17564, + "grad_norm": 1.0466206078148117, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 17564 + }, + { + "epoch": 0.17565, + "grad_norm": 1.0182783625602205, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 17565 + }, + { + "epoch": 0.17566, + "grad_norm": 1.0326103998641878, + "learning_rate": 0.003, + "loss": 4.045, + "step": 17566 + }, + { + "epoch": 0.17567, + "grad_norm": 0.9968830108954899, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 17567 + }, + { + "epoch": 0.17568, + "grad_norm": 0.9007540450812659, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 17568 + }, + { + "epoch": 0.17569, + "grad_norm": 0.8979482803940627, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 17569 + }, + { + "epoch": 0.1757, + "grad_norm": 0.9629958494496546, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 17570 + }, + { + "epoch": 0.17571, + "grad_norm": 1.0521231509896205, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 17571 + }, + { + "epoch": 0.17572, + "grad_norm": 0.9850792220121751, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 17572 + }, + { + "epoch": 0.17573, + "grad_norm": 0.8663062852170624, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 17573 + }, + { + "epoch": 0.17574, + "grad_norm": 0.8124220092050762, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 17574 + }, + { + "epoch": 0.17575, + "grad_norm": 0.8607388898653316, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 17575 + }, + { + "epoch": 0.17576, + "grad_norm": 0.901962162034722, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 17576 + }, + { + "epoch": 0.17577, + "grad_norm": 1.2222671148394237, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 17577 + }, + { + "epoch": 0.17578, + "grad_norm": 1.019216447215782, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 17578 + }, + { + "epoch": 0.17579, + "grad_norm": 0.9295758215508766, + "learning_rate": 0.003, + "loss": 4.074, + "step": 17579 + }, + { + "epoch": 0.1758, + "grad_norm": 0.837650754271571, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 17580 + }, + { + "epoch": 0.17581, + "grad_norm": 0.9027296338426551, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 17581 + }, + { + "epoch": 0.17582, + "grad_norm": 1.0045155133886974, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 17582 + }, + { + "epoch": 0.17583, + "grad_norm": 0.8359872382161975, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 17583 + }, + { + "epoch": 0.17584, + "grad_norm": 0.7548350159816791, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 17584 + }, + { + "epoch": 0.17585, + "grad_norm": 0.7254987194509632, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 17585 + }, + { + "epoch": 0.17586, + "grad_norm": 0.6495839319516847, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 17586 + }, + { + "epoch": 0.17587, + "grad_norm": 0.6828203570598029, + "learning_rate": 0.003, + "loss": 4.071, + "step": 17587 + }, + { + "epoch": 0.17588, + "grad_norm": 0.7517051766671944, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 17588 + }, + { + "epoch": 0.17589, + "grad_norm": 0.7495410651746253, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 17589 + }, + { + "epoch": 0.1759, + "grad_norm": 0.6624188009957733, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 17590 + }, + { + "epoch": 0.17591, + "grad_norm": 0.7281784386661555, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 17591 + }, + { + "epoch": 0.17592, + "grad_norm": 0.8842637341012457, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 17592 + }, + { + "epoch": 0.17593, + "grad_norm": 1.139081154896682, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 17593 + }, + { + "epoch": 0.17594, + "grad_norm": 1.0886166939298445, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 17594 + }, + { + "epoch": 0.17595, + "grad_norm": 0.8427242543047653, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 17595 + }, + { + "epoch": 0.17596, + "grad_norm": 0.8662505373315522, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 17596 + }, + { + "epoch": 0.17597, + "grad_norm": 0.8607201150234504, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 17597 + }, + { + "epoch": 0.17598, + "grad_norm": 0.7840325764956231, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 17598 + }, + { + "epoch": 0.17599, + "grad_norm": 0.7577531649744832, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 17599 + }, + { + "epoch": 0.176, + "grad_norm": 0.6664602135017921, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 17600 + }, + { + "epoch": 0.17601, + "grad_norm": 0.6938148426517162, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 17601 + }, + { + "epoch": 0.17602, + "grad_norm": 0.6539775755754308, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 17602 + }, + { + "epoch": 0.17603, + "grad_norm": 0.6069523128590745, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 17603 + }, + { + "epoch": 0.17604, + "grad_norm": 0.6773916182111083, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 17604 + }, + { + "epoch": 0.17605, + "grad_norm": 1.0012014432715137, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 17605 + }, + { + "epoch": 0.17606, + "grad_norm": 1.4370955588934409, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 17606 + }, + { + "epoch": 0.17607, + "grad_norm": 0.6531011281218092, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 17607 + }, + { + "epoch": 0.17608, + "grad_norm": 0.7464058321070703, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 17608 + }, + { + "epoch": 0.17609, + "grad_norm": 0.7218063876980279, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 17609 + }, + { + "epoch": 0.1761, + "grad_norm": 0.8173621803491683, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 17610 + }, + { + "epoch": 0.17611, + "grad_norm": 0.9427938699169145, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 17611 + }, + { + "epoch": 0.17612, + "grad_norm": 1.0849678597089432, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 17612 + }, + { + "epoch": 0.17613, + "grad_norm": 0.9679820228531881, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 17613 + }, + { + "epoch": 0.17614, + "grad_norm": 0.8665253923119886, + "learning_rate": 0.003, + "loss": 4.034, + "step": 17614 + }, + { + "epoch": 0.17615, + "grad_norm": 0.9042479329654656, + "learning_rate": 0.003, + "loss": 4.037, + "step": 17615 + }, + { + "epoch": 0.17616, + "grad_norm": 0.9616079844836394, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 17616 + }, + { + "epoch": 0.17617, + "grad_norm": 1.0036374450591588, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 17617 + }, + { + "epoch": 0.17618, + "grad_norm": 1.2022344861569625, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 17618 + }, + { + "epoch": 0.17619, + "grad_norm": 0.947988080738896, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 17619 + }, + { + "epoch": 0.1762, + "grad_norm": 0.9869514541150077, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 17620 + }, + { + "epoch": 0.17621, + "grad_norm": 1.1205504433298326, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 17621 + }, + { + "epoch": 0.17622, + "grad_norm": 0.9509252371225612, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 17622 + }, + { + "epoch": 0.17623, + "grad_norm": 0.9135463708548369, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 17623 + }, + { + "epoch": 0.17624, + "grad_norm": 0.9731374729663851, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 17624 + }, + { + "epoch": 0.17625, + "grad_norm": 1.0870998953977447, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 17625 + }, + { + "epoch": 0.17626, + "grad_norm": 0.9541337789351465, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 17626 + }, + { + "epoch": 0.17627, + "grad_norm": 1.0374954049945748, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 17627 + }, + { + "epoch": 0.17628, + "grad_norm": 0.9253188613706079, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 17628 + }, + { + "epoch": 0.17629, + "grad_norm": 0.9231641047895617, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 17629 + }, + { + "epoch": 0.1763, + "grad_norm": 0.7818472035667626, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 17630 + }, + { + "epoch": 0.17631, + "grad_norm": 0.8139538974411653, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 17631 + }, + { + "epoch": 0.17632, + "grad_norm": 0.8106230937634593, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 17632 + }, + { + "epoch": 0.17633, + "grad_norm": 0.6904086027683515, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 17633 + }, + { + "epoch": 0.17634, + "grad_norm": 0.6876779716839564, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 17634 + }, + { + "epoch": 0.17635, + "grad_norm": 0.7604731473197992, + "learning_rate": 0.003, + "loss": 4.05, + "step": 17635 + }, + { + "epoch": 0.17636, + "grad_norm": 0.9055509128125596, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 17636 + }, + { + "epoch": 0.17637, + "grad_norm": 0.827680104335568, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 17637 + }, + { + "epoch": 0.17638, + "grad_norm": 0.6740103740480011, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 17638 + }, + { + "epoch": 0.17639, + "grad_norm": 0.6649361019173743, + "learning_rate": 0.003, + "loss": 4.06, + "step": 17639 + }, + { + "epoch": 0.1764, + "grad_norm": 0.8923943342551018, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 17640 + }, + { + "epoch": 0.17641, + "grad_norm": 1.2564295114377186, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 17641 + }, + { + "epoch": 0.17642, + "grad_norm": 1.116295566826262, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 17642 + }, + { + "epoch": 0.17643, + "grad_norm": 0.8359441106024887, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 17643 + }, + { + "epoch": 0.17644, + "grad_norm": 0.7950901805727237, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 17644 + }, + { + "epoch": 0.17645, + "grad_norm": 0.7428549364789354, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 17645 + }, + { + "epoch": 0.17646, + "grad_norm": 0.6895843602383841, + "learning_rate": 0.003, + "loss": 4.045, + "step": 17646 + }, + { + "epoch": 0.17647, + "grad_norm": 0.7174011282035472, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 17647 + }, + { + "epoch": 0.17648, + "grad_norm": 0.7753711222624593, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 17648 + }, + { + "epoch": 0.17649, + "grad_norm": 0.8521666068533342, + "learning_rate": 0.003, + "loss": 4.041, + "step": 17649 + }, + { + "epoch": 0.1765, + "grad_norm": 1.0663392302108505, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 17650 + }, + { + "epoch": 0.17651, + "grad_norm": 1.0774798343451986, + "learning_rate": 0.003, + "loss": 4.063, + "step": 17651 + }, + { + "epoch": 0.17652, + "grad_norm": 1.0407904728304735, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 17652 + }, + { + "epoch": 0.17653, + "grad_norm": 0.8356324203494921, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 17653 + }, + { + "epoch": 0.17654, + "grad_norm": 0.6126536594835358, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 17654 + }, + { + "epoch": 0.17655, + "grad_norm": 0.6086643446139258, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 17655 + }, + { + "epoch": 0.17656, + "grad_norm": 0.6849985521799608, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 17656 + }, + { + "epoch": 0.17657, + "grad_norm": 0.7664003335440541, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 17657 + }, + { + "epoch": 0.17658, + "grad_norm": 0.9162724064067518, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 17658 + }, + { + "epoch": 0.17659, + "grad_norm": 1.1762867318446273, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 17659 + }, + { + "epoch": 0.1766, + "grad_norm": 1.171536516985786, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 17660 + }, + { + "epoch": 0.17661, + "grad_norm": 0.8788337587723518, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 17661 + }, + { + "epoch": 0.17662, + "grad_norm": 0.8182153037944079, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 17662 + }, + { + "epoch": 0.17663, + "grad_norm": 0.7129332044919899, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 17663 + }, + { + "epoch": 0.17664, + "grad_norm": 0.7108767454704167, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 17664 + }, + { + "epoch": 0.17665, + "grad_norm": 0.7158462859575461, + "learning_rate": 0.003, + "loss": 4.067, + "step": 17665 + }, + { + "epoch": 0.17666, + "grad_norm": 0.8142065368066004, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 17666 + }, + { + "epoch": 0.17667, + "grad_norm": 1.1001353519020953, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 17667 + }, + { + "epoch": 0.17668, + "grad_norm": 1.125336827208438, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 17668 + }, + { + "epoch": 0.17669, + "grad_norm": 0.9262014892175805, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 17669 + }, + { + "epoch": 0.1767, + "grad_norm": 0.9351966265618215, + "learning_rate": 0.003, + "loss": 4.05, + "step": 17670 + }, + { + "epoch": 0.17671, + "grad_norm": 1.0079906903639217, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 17671 + }, + { + "epoch": 0.17672, + "grad_norm": 1.1162098820290083, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 17672 + }, + { + "epoch": 0.17673, + "grad_norm": 0.8669269208345238, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 17673 + }, + { + "epoch": 0.17674, + "grad_norm": 0.8963895136624581, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 17674 + }, + { + "epoch": 0.17675, + "grad_norm": 0.9956867098591692, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 17675 + }, + { + "epoch": 0.17676, + "grad_norm": 1.0116655226855777, + "learning_rate": 0.003, + "loss": 4.077, + "step": 17676 + }, + { + "epoch": 0.17677, + "grad_norm": 0.9397732140747828, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 17677 + }, + { + "epoch": 0.17678, + "grad_norm": 0.9501246526436264, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 17678 + }, + { + "epoch": 0.17679, + "grad_norm": 0.907004381747051, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 17679 + }, + { + "epoch": 0.1768, + "grad_norm": 0.9668483402391215, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 17680 + }, + { + "epoch": 0.17681, + "grad_norm": 1.1724624604996552, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 17681 + }, + { + "epoch": 0.17682, + "grad_norm": 0.9348259883600136, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 17682 + }, + { + "epoch": 0.17683, + "grad_norm": 0.9512258847754369, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 17683 + }, + { + "epoch": 0.17684, + "grad_norm": 0.9823492002675904, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 17684 + }, + { + "epoch": 0.17685, + "grad_norm": 0.9649539217405146, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 17685 + }, + { + "epoch": 0.17686, + "grad_norm": 0.9478093402862565, + "learning_rate": 0.003, + "loss": 4.081, + "step": 17686 + }, + { + "epoch": 0.17687, + "grad_norm": 1.0166366580765052, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 17687 + }, + { + "epoch": 0.17688, + "grad_norm": 1.0379627024703006, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 17688 + }, + { + "epoch": 0.17689, + "grad_norm": 1.0749147037028024, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 17689 + }, + { + "epoch": 0.1769, + "grad_norm": 0.9386869748228578, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 17690 + }, + { + "epoch": 0.17691, + "grad_norm": 1.0263503233986766, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 17691 + }, + { + "epoch": 0.17692, + "grad_norm": 1.2009848361111113, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 17692 + }, + { + "epoch": 0.17693, + "grad_norm": 0.9468106159021873, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 17693 + }, + { + "epoch": 0.17694, + "grad_norm": 0.7928861683500125, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 17694 + }, + { + "epoch": 0.17695, + "grad_norm": 0.713073859342127, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 17695 + }, + { + "epoch": 0.17696, + "grad_norm": 0.5780474132367384, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 17696 + }, + { + "epoch": 0.17697, + "grad_norm": 0.4888099583148028, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 17697 + }, + { + "epoch": 0.17698, + "grad_norm": 0.5240675538308072, + "learning_rate": 0.003, + "loss": 4.062, + "step": 17698 + }, + { + "epoch": 0.17699, + "grad_norm": 0.5808100200357558, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 17699 + }, + { + "epoch": 0.177, + "grad_norm": 0.652105814167825, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 17700 + }, + { + "epoch": 0.17701, + "grad_norm": 0.6724715074058332, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 17701 + }, + { + "epoch": 0.17702, + "grad_norm": 0.6273551136784689, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 17702 + }, + { + "epoch": 0.17703, + "grad_norm": 0.7188470214720498, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 17703 + }, + { + "epoch": 0.17704, + "grad_norm": 0.9196777006848662, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 17704 + }, + { + "epoch": 0.17705, + "grad_norm": 1.233604016695256, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 17705 + }, + { + "epoch": 0.17706, + "grad_norm": 0.8099758682921231, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 17706 + }, + { + "epoch": 0.17707, + "grad_norm": 0.7922889457768697, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 17707 + }, + { + "epoch": 0.17708, + "grad_norm": 0.9294910034701283, + "learning_rate": 0.003, + "loss": 4.051, + "step": 17708 + }, + { + "epoch": 0.17709, + "grad_norm": 0.9634719340790691, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 17709 + }, + { + "epoch": 0.1771, + "grad_norm": 0.8542883355439383, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 17710 + }, + { + "epoch": 0.17711, + "grad_norm": 1.008147096757313, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 17711 + }, + { + "epoch": 0.17712, + "grad_norm": 0.9995776782234835, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 17712 + }, + { + "epoch": 0.17713, + "grad_norm": 1.035762142246252, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 17713 + }, + { + "epoch": 0.17714, + "grad_norm": 1.0812204223566275, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 17714 + }, + { + "epoch": 0.17715, + "grad_norm": 1.0160003851823913, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 17715 + }, + { + "epoch": 0.17716, + "grad_norm": 0.9901224019194541, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 17716 + }, + { + "epoch": 0.17717, + "grad_norm": 1.0105523828866692, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 17717 + }, + { + "epoch": 0.17718, + "grad_norm": 0.8803703923794938, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 17718 + }, + { + "epoch": 0.17719, + "grad_norm": 0.8475179522916448, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 17719 + }, + { + "epoch": 0.1772, + "grad_norm": 0.8505723124785796, + "learning_rate": 0.003, + "loss": 4.052, + "step": 17720 + }, + { + "epoch": 0.17721, + "grad_norm": 0.8355218228494463, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 17721 + }, + { + "epoch": 0.17722, + "grad_norm": 0.7703889576636893, + "learning_rate": 0.003, + "loss": 4.036, + "step": 17722 + }, + { + "epoch": 0.17723, + "grad_norm": 0.8495813991491132, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 17723 + }, + { + "epoch": 0.17724, + "grad_norm": 0.9134821544671048, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 17724 + }, + { + "epoch": 0.17725, + "grad_norm": 0.8144739276381116, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 17725 + }, + { + "epoch": 0.17726, + "grad_norm": 0.6939416908790277, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 17726 + }, + { + "epoch": 0.17727, + "grad_norm": 0.6454072470607094, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 17727 + }, + { + "epoch": 0.17728, + "grad_norm": 0.7014165435036105, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 17728 + }, + { + "epoch": 0.17729, + "grad_norm": 0.7311048415725621, + "learning_rate": 0.003, + "loss": 4.06, + "step": 17729 + }, + { + "epoch": 0.1773, + "grad_norm": 0.8872622358163939, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 17730 + }, + { + "epoch": 0.17731, + "grad_norm": 1.2137089571607587, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 17731 + }, + { + "epoch": 0.17732, + "grad_norm": 0.9294622943031654, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 17732 + }, + { + "epoch": 0.17733, + "grad_norm": 0.7542301139801861, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 17733 + }, + { + "epoch": 0.17734, + "grad_norm": 0.6185305422801383, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 17734 + }, + { + "epoch": 0.17735, + "grad_norm": 0.6137864649173381, + "learning_rate": 0.003, + "loss": 4.052, + "step": 17735 + }, + { + "epoch": 0.17736, + "grad_norm": 0.6659455575828724, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 17736 + }, + { + "epoch": 0.17737, + "grad_norm": 0.8345702722822673, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 17737 + }, + { + "epoch": 0.17738, + "grad_norm": 0.9022353673728907, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 17738 + }, + { + "epoch": 0.17739, + "grad_norm": 0.8191784473938587, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 17739 + }, + { + "epoch": 0.1774, + "grad_norm": 0.8883865655499635, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 17740 + }, + { + "epoch": 0.17741, + "grad_norm": 0.9811315410758518, + "learning_rate": 0.003, + "loss": 4.07, + "step": 17741 + }, + { + "epoch": 0.17742, + "grad_norm": 1.2013511256150107, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 17742 + }, + { + "epoch": 0.17743, + "grad_norm": 1.092672255113907, + "learning_rate": 0.003, + "loss": 4.106, + "step": 17743 + }, + { + "epoch": 0.17744, + "grad_norm": 0.9219106015146562, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 17744 + }, + { + "epoch": 0.17745, + "grad_norm": 0.812974922409387, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 17745 + }, + { + "epoch": 0.17746, + "grad_norm": 0.8316941146679137, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 17746 + }, + { + "epoch": 0.17747, + "grad_norm": 0.9060983849781237, + "learning_rate": 0.003, + "loss": 4.065, + "step": 17747 + }, + { + "epoch": 0.17748, + "grad_norm": 0.889852248776499, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 17748 + }, + { + "epoch": 0.17749, + "grad_norm": 0.7936664451323253, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 17749 + }, + { + "epoch": 0.1775, + "grad_norm": 0.7516260574375117, + "learning_rate": 0.003, + "loss": 4.061, + "step": 17750 + }, + { + "epoch": 0.17751, + "grad_norm": 0.6923171146385486, + "learning_rate": 0.003, + "loss": 4.079, + "step": 17751 + }, + { + "epoch": 0.17752, + "grad_norm": 0.6154399217859506, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 17752 + }, + { + "epoch": 0.17753, + "grad_norm": 0.5548021200243216, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 17753 + }, + { + "epoch": 0.17754, + "grad_norm": 0.6382749059286056, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 17754 + }, + { + "epoch": 0.17755, + "grad_norm": 0.6703350674066354, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 17755 + }, + { + "epoch": 0.17756, + "grad_norm": 0.7765406248457878, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 17756 + }, + { + "epoch": 0.17757, + "grad_norm": 0.9874011405256912, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 17757 + }, + { + "epoch": 0.17758, + "grad_norm": 1.1864614021848556, + "learning_rate": 0.003, + "loss": 4.081, + "step": 17758 + }, + { + "epoch": 0.17759, + "grad_norm": 0.7966573306432746, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 17759 + }, + { + "epoch": 0.1776, + "grad_norm": 0.8550868859423417, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 17760 + }, + { + "epoch": 0.17761, + "grad_norm": 0.8653694868982882, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 17761 + }, + { + "epoch": 0.17762, + "grad_norm": 1.13111863908971, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 17762 + }, + { + "epoch": 0.17763, + "grad_norm": 1.369890814521613, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 17763 + }, + { + "epoch": 0.17764, + "grad_norm": 0.9022243922006958, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 17764 + }, + { + "epoch": 0.17765, + "grad_norm": 1.0692650764857046, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 17765 + }, + { + "epoch": 0.17766, + "grad_norm": 1.047074446704455, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 17766 + }, + { + "epoch": 0.17767, + "grad_norm": 0.9448472426070241, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 17767 + }, + { + "epoch": 0.17768, + "grad_norm": 0.965255522704598, + "learning_rate": 0.003, + "loss": 4.051, + "step": 17768 + }, + { + "epoch": 0.17769, + "grad_norm": 0.9941641729165432, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 17769 + }, + { + "epoch": 0.1777, + "grad_norm": 0.9188787222419813, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 17770 + }, + { + "epoch": 0.17771, + "grad_norm": 0.7719858937803888, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 17771 + }, + { + "epoch": 0.17772, + "grad_norm": 0.8321469503134653, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 17772 + }, + { + "epoch": 0.17773, + "grad_norm": 0.8980540599473181, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 17773 + }, + { + "epoch": 0.17774, + "grad_norm": 1.1251410166996434, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 17774 + }, + { + "epoch": 0.17775, + "grad_norm": 0.8668630414936879, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 17775 + }, + { + "epoch": 0.17776, + "grad_norm": 0.7228707330959857, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 17776 + }, + { + "epoch": 0.17777, + "grad_norm": 0.7450778998710901, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 17777 + }, + { + "epoch": 0.17778, + "grad_norm": 0.6943377392989274, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 17778 + }, + { + "epoch": 0.17779, + "grad_norm": 0.7187853790848213, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 17779 + }, + { + "epoch": 0.1778, + "grad_norm": 0.7349671084053078, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 17780 + }, + { + "epoch": 0.17781, + "grad_norm": 0.711743364709573, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 17781 + }, + { + "epoch": 0.17782, + "grad_norm": 0.8813112801703069, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 17782 + }, + { + "epoch": 0.17783, + "grad_norm": 1.0788721319362986, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 17783 + }, + { + "epoch": 0.17784, + "grad_norm": 0.9905193408532994, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 17784 + }, + { + "epoch": 0.17785, + "grad_norm": 1.0033328942452602, + "learning_rate": 0.003, + "loss": 4.045, + "step": 17785 + }, + { + "epoch": 0.17786, + "grad_norm": 1.1187785421076482, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 17786 + }, + { + "epoch": 0.17787, + "grad_norm": 1.0117980388825438, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 17787 + }, + { + "epoch": 0.17788, + "grad_norm": 0.9705070010981849, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 17788 + }, + { + "epoch": 0.17789, + "grad_norm": 0.7926296671649389, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 17789 + }, + { + "epoch": 0.1779, + "grad_norm": 0.8256372867819184, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 17790 + }, + { + "epoch": 0.17791, + "grad_norm": 1.0366312697839044, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 17791 + }, + { + "epoch": 0.17792, + "grad_norm": 1.0290880745328101, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 17792 + }, + { + "epoch": 0.17793, + "grad_norm": 0.9556266395700164, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 17793 + }, + { + "epoch": 0.17794, + "grad_norm": 1.0961147248935896, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 17794 + }, + { + "epoch": 0.17795, + "grad_norm": 1.2133294082716444, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 17795 + }, + { + "epoch": 0.17796, + "grad_norm": 1.0670244159468336, + "learning_rate": 0.003, + "loss": 4.053, + "step": 17796 + }, + { + "epoch": 0.17797, + "grad_norm": 0.9168176835378857, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 17797 + }, + { + "epoch": 0.17798, + "grad_norm": 0.8872025151479986, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 17798 + }, + { + "epoch": 0.17799, + "grad_norm": 0.8922375868449132, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 17799 + }, + { + "epoch": 0.178, + "grad_norm": 0.9132471119186745, + "learning_rate": 0.003, + "loss": 4.075, + "step": 17800 + }, + { + "epoch": 0.17801, + "grad_norm": 1.012904222596664, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 17801 + }, + { + "epoch": 0.17802, + "grad_norm": 0.9200202318398326, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 17802 + }, + { + "epoch": 0.17803, + "grad_norm": 0.9801841750868228, + "learning_rate": 0.003, + "loss": 4.081, + "step": 17803 + }, + { + "epoch": 0.17804, + "grad_norm": 1.1263800660087637, + "learning_rate": 0.003, + "loss": 4.091, + "step": 17804 + }, + { + "epoch": 0.17805, + "grad_norm": 0.9465104112523818, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 17805 + }, + { + "epoch": 0.17806, + "grad_norm": 0.7347905293908259, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 17806 + }, + { + "epoch": 0.17807, + "grad_norm": 0.6378036178893862, + "learning_rate": 0.003, + "loss": 4.055, + "step": 17807 + }, + { + "epoch": 0.17808, + "grad_norm": 0.6231379764645049, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 17808 + }, + { + "epoch": 0.17809, + "grad_norm": 0.574030254558826, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 17809 + }, + { + "epoch": 0.1781, + "grad_norm": 0.6262467973941603, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 17810 + }, + { + "epoch": 0.17811, + "grad_norm": 0.6503924772853084, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 17811 + }, + { + "epoch": 0.17812, + "grad_norm": 0.6428411925342502, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 17812 + }, + { + "epoch": 0.17813, + "grad_norm": 0.7227626352575178, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 17813 + }, + { + "epoch": 0.17814, + "grad_norm": 0.9585038957744706, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 17814 + }, + { + "epoch": 0.17815, + "grad_norm": 1.136555587105889, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 17815 + }, + { + "epoch": 0.17816, + "grad_norm": 0.9097401319619741, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 17816 + }, + { + "epoch": 0.17817, + "grad_norm": 0.9773536311525911, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 17817 + }, + { + "epoch": 0.17818, + "grad_norm": 1.0389111413400334, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 17818 + }, + { + "epoch": 0.17819, + "grad_norm": 1.1815291234110734, + "learning_rate": 0.003, + "loss": 4.087, + "step": 17819 + }, + { + "epoch": 0.1782, + "grad_norm": 0.91207429700373, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 17820 + }, + { + "epoch": 0.17821, + "grad_norm": 0.7860232391886783, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 17821 + }, + { + "epoch": 0.17822, + "grad_norm": 0.7035842037145283, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 17822 + }, + { + "epoch": 0.17823, + "grad_norm": 0.6840969647439207, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 17823 + }, + { + "epoch": 0.17824, + "grad_norm": 0.6217284845722004, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 17824 + }, + { + "epoch": 0.17825, + "grad_norm": 0.6704324314407434, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 17825 + }, + { + "epoch": 0.17826, + "grad_norm": 0.8147753576290412, + "learning_rate": 0.003, + "loss": 4.062, + "step": 17826 + }, + { + "epoch": 0.17827, + "grad_norm": 1.0191376389186433, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 17827 + }, + { + "epoch": 0.17828, + "grad_norm": 1.0841394308763148, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 17828 + }, + { + "epoch": 0.17829, + "grad_norm": 0.8344426691123225, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 17829 + }, + { + "epoch": 0.1783, + "grad_norm": 0.7544609278945689, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 17830 + }, + { + "epoch": 0.17831, + "grad_norm": 0.703177202987652, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 17831 + }, + { + "epoch": 0.17832, + "grad_norm": 0.8088154018493255, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 17832 + }, + { + "epoch": 0.17833, + "grad_norm": 0.846507696887801, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 17833 + }, + { + "epoch": 0.17834, + "grad_norm": 0.9479917459460131, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 17834 + }, + { + "epoch": 0.17835, + "grad_norm": 1.0166749442753464, + "learning_rate": 0.003, + "loss": 4.075, + "step": 17835 + }, + { + "epoch": 0.17836, + "grad_norm": 0.9724725426647024, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 17836 + }, + { + "epoch": 0.17837, + "grad_norm": 0.9442101310692572, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 17837 + }, + { + "epoch": 0.17838, + "grad_norm": 0.8850063830339444, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 17838 + }, + { + "epoch": 0.17839, + "grad_norm": 0.9788694520353807, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 17839 + }, + { + "epoch": 0.1784, + "grad_norm": 1.0457320811640962, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 17840 + }, + { + "epoch": 0.17841, + "grad_norm": 0.8789040112559586, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 17841 + }, + { + "epoch": 0.17842, + "grad_norm": 0.9464363365637045, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 17842 + }, + { + "epoch": 0.17843, + "grad_norm": 1.029546704797995, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 17843 + }, + { + "epoch": 0.17844, + "grad_norm": 1.1172100045966578, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 17844 + }, + { + "epoch": 0.17845, + "grad_norm": 1.0220370572765092, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 17845 + }, + { + "epoch": 0.17846, + "grad_norm": 0.9969638416217529, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 17846 + }, + { + "epoch": 0.17847, + "grad_norm": 0.988076425895647, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 17847 + }, + { + "epoch": 0.17848, + "grad_norm": 0.8811541259749696, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 17848 + }, + { + "epoch": 0.17849, + "grad_norm": 0.8635706958060115, + "learning_rate": 0.003, + "loss": 4.096, + "step": 17849 + }, + { + "epoch": 0.1785, + "grad_norm": 0.892420935401702, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 17850 + }, + { + "epoch": 0.17851, + "grad_norm": 1.016620971248911, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 17851 + }, + { + "epoch": 0.17852, + "grad_norm": 1.146981148203576, + "learning_rate": 0.003, + "loss": 4.084, + "step": 17852 + }, + { + "epoch": 0.17853, + "grad_norm": 0.9661364880061537, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 17853 + }, + { + "epoch": 0.17854, + "grad_norm": 1.030066776504963, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 17854 + }, + { + "epoch": 0.17855, + "grad_norm": 1.0163877980167015, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 17855 + }, + { + "epoch": 0.17856, + "grad_norm": 1.0598006866318361, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 17856 + }, + { + "epoch": 0.17857, + "grad_norm": 1.11541742937424, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 17857 + }, + { + "epoch": 0.17858, + "grad_norm": 0.8217026406075862, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 17858 + }, + { + "epoch": 0.17859, + "grad_norm": 0.7526103482689931, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 17859 + }, + { + "epoch": 0.1786, + "grad_norm": 0.8674666582553064, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 17860 + }, + { + "epoch": 0.17861, + "grad_norm": 1.0235191697043038, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 17861 + }, + { + "epoch": 0.17862, + "grad_norm": 1.113427436262814, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 17862 + }, + { + "epoch": 0.17863, + "grad_norm": 0.9294745221925278, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 17863 + }, + { + "epoch": 0.17864, + "grad_norm": 0.8695417549804368, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 17864 + }, + { + "epoch": 0.17865, + "grad_norm": 0.8222361364773434, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 17865 + }, + { + "epoch": 0.17866, + "grad_norm": 0.7582328748735376, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 17866 + }, + { + "epoch": 0.17867, + "grad_norm": 0.8418979652681704, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 17867 + }, + { + "epoch": 0.17868, + "grad_norm": 0.823726393265443, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 17868 + }, + { + "epoch": 0.17869, + "grad_norm": 0.8347670839103752, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 17869 + }, + { + "epoch": 0.1787, + "grad_norm": 0.8844912797792485, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 17870 + }, + { + "epoch": 0.17871, + "grad_norm": 1.0457242349054428, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 17871 + }, + { + "epoch": 0.17872, + "grad_norm": 0.9981604675465449, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 17872 + }, + { + "epoch": 0.17873, + "grad_norm": 0.9000260217232073, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 17873 + }, + { + "epoch": 0.17874, + "grad_norm": 0.8210044898990569, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 17874 + }, + { + "epoch": 0.17875, + "grad_norm": 0.7098912171295579, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 17875 + }, + { + "epoch": 0.17876, + "grad_norm": 0.6314395218838892, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 17876 + }, + { + "epoch": 0.17877, + "grad_norm": 0.7458507738392257, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 17877 + }, + { + "epoch": 0.17878, + "grad_norm": 0.845141126710546, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 17878 + }, + { + "epoch": 0.17879, + "grad_norm": 1.0997008575780136, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 17879 + }, + { + "epoch": 0.1788, + "grad_norm": 1.0677830801770296, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 17880 + }, + { + "epoch": 0.17881, + "grad_norm": 0.8967100099923909, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 17881 + }, + { + "epoch": 0.17882, + "grad_norm": 0.8116601445634104, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 17882 + }, + { + "epoch": 0.17883, + "grad_norm": 0.7008666203406305, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 17883 + }, + { + "epoch": 0.17884, + "grad_norm": 0.7504508937434677, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 17884 + }, + { + "epoch": 0.17885, + "grad_norm": 0.7932994676003893, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 17885 + }, + { + "epoch": 0.17886, + "grad_norm": 0.9830876472836074, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 17886 + }, + { + "epoch": 0.17887, + "grad_norm": 1.1207446961132943, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 17887 + }, + { + "epoch": 0.17888, + "grad_norm": 0.8747089539131329, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 17888 + }, + { + "epoch": 0.17889, + "grad_norm": 0.918664755130003, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 17889 + }, + { + "epoch": 0.1789, + "grad_norm": 0.9903103179316689, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 17890 + }, + { + "epoch": 0.17891, + "grad_norm": 0.9649525416433561, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 17891 + }, + { + "epoch": 0.17892, + "grad_norm": 0.9506342680304857, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 17892 + }, + { + "epoch": 0.17893, + "grad_norm": 0.9434584248212771, + "learning_rate": 0.003, + "loss": 4.053, + "step": 17893 + }, + { + "epoch": 0.17894, + "grad_norm": 1.1081695893566654, + "learning_rate": 0.003, + "loss": 4.072, + "step": 17894 + }, + { + "epoch": 0.17895, + "grad_norm": 0.8594611751215389, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 17895 + }, + { + "epoch": 0.17896, + "grad_norm": 0.870788515369203, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 17896 + }, + { + "epoch": 0.17897, + "grad_norm": 0.893168486716251, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 17897 + }, + { + "epoch": 0.17898, + "grad_norm": 0.8393792542325452, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 17898 + }, + { + "epoch": 0.17899, + "grad_norm": 0.8016239868093878, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 17899 + }, + { + "epoch": 0.179, + "grad_norm": 0.7366597111961479, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 17900 + }, + { + "epoch": 0.17901, + "grad_norm": 0.6606758482109569, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 17901 + }, + { + "epoch": 0.17902, + "grad_norm": 0.7029842079254771, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 17902 + }, + { + "epoch": 0.17903, + "grad_norm": 0.9665779376585047, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 17903 + }, + { + "epoch": 0.17904, + "grad_norm": 1.434524948923059, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 17904 + }, + { + "epoch": 0.17905, + "grad_norm": 0.6099934258825926, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 17905 + }, + { + "epoch": 0.17906, + "grad_norm": 0.7921307035709443, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 17906 + }, + { + "epoch": 0.17907, + "grad_norm": 0.8637347471785389, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 17907 + }, + { + "epoch": 0.17908, + "grad_norm": 0.7657210128835413, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 17908 + }, + { + "epoch": 0.17909, + "grad_norm": 0.7102705729311167, + "learning_rate": 0.003, + "loss": 4.083, + "step": 17909 + }, + { + "epoch": 0.1791, + "grad_norm": 0.7722022559628559, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 17910 + }, + { + "epoch": 0.17911, + "grad_norm": 0.7027911251903038, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 17911 + }, + { + "epoch": 0.17912, + "grad_norm": 0.7761335176024163, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 17912 + }, + { + "epoch": 0.17913, + "grad_norm": 0.9494240497237516, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 17913 + }, + { + "epoch": 0.17914, + "grad_norm": 1.1110165029892805, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 17914 + }, + { + "epoch": 0.17915, + "grad_norm": 0.9374310699867275, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 17915 + }, + { + "epoch": 0.17916, + "grad_norm": 0.8897764269231634, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 17916 + }, + { + "epoch": 0.17917, + "grad_norm": 0.7992151892069709, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 17917 + }, + { + "epoch": 0.17918, + "grad_norm": 0.8797485344863202, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 17918 + }, + { + "epoch": 0.17919, + "grad_norm": 0.8672758974774529, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 17919 + }, + { + "epoch": 0.1792, + "grad_norm": 0.8062273152118529, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 17920 + }, + { + "epoch": 0.17921, + "grad_norm": 0.7614024663606663, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 17921 + }, + { + "epoch": 0.17922, + "grad_norm": 0.8741360617038456, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 17922 + }, + { + "epoch": 0.17923, + "grad_norm": 1.0141438743501523, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 17923 + }, + { + "epoch": 0.17924, + "grad_norm": 0.8478569345318532, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 17924 + }, + { + "epoch": 0.17925, + "grad_norm": 0.7520471566488796, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 17925 + }, + { + "epoch": 0.17926, + "grad_norm": 0.6978604676792622, + "learning_rate": 0.003, + "loss": 4.037, + "step": 17926 + }, + { + "epoch": 0.17927, + "grad_norm": 0.8497823407207831, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 17927 + }, + { + "epoch": 0.17928, + "grad_norm": 1.1737409490970645, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 17928 + }, + { + "epoch": 0.17929, + "grad_norm": 0.9216083350139652, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 17929 + }, + { + "epoch": 0.1793, + "grad_norm": 0.9540698123430895, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 17930 + }, + { + "epoch": 0.17931, + "grad_norm": 1.180372591646462, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 17931 + }, + { + "epoch": 0.17932, + "grad_norm": 0.8802199886666183, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 17932 + }, + { + "epoch": 0.17933, + "grad_norm": 0.8734116782838467, + "learning_rate": 0.003, + "loss": 4.049, + "step": 17933 + }, + { + "epoch": 0.17934, + "grad_norm": 0.7855352141197343, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 17934 + }, + { + "epoch": 0.17935, + "grad_norm": 0.8975687543207646, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 17935 + }, + { + "epoch": 0.17936, + "grad_norm": 1.1101120089195442, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 17936 + }, + { + "epoch": 0.17937, + "grad_norm": 1.0529926461257948, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 17937 + }, + { + "epoch": 0.17938, + "grad_norm": 0.9307149829814615, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 17938 + }, + { + "epoch": 0.17939, + "grad_norm": 0.9991407278947849, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 17939 + }, + { + "epoch": 0.1794, + "grad_norm": 1.0413405119071344, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 17940 + }, + { + "epoch": 0.17941, + "grad_norm": 1.1522582155284176, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 17941 + }, + { + "epoch": 0.17942, + "grad_norm": 1.0635877004853507, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 17942 + }, + { + "epoch": 0.17943, + "grad_norm": 0.8479492847157323, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 17943 + }, + { + "epoch": 0.17944, + "grad_norm": 0.8325614379958157, + "learning_rate": 0.003, + "loss": 4.081, + "step": 17944 + }, + { + "epoch": 0.17945, + "grad_norm": 0.8936956994628485, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 17945 + }, + { + "epoch": 0.17946, + "grad_norm": 0.8655358709774501, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 17946 + }, + { + "epoch": 0.17947, + "grad_norm": 0.8450066761147643, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 17947 + }, + { + "epoch": 0.17948, + "grad_norm": 0.8266749042578516, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 17948 + }, + { + "epoch": 0.17949, + "grad_norm": 0.8730306356348632, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 17949 + }, + { + "epoch": 0.1795, + "grad_norm": 1.1400733758214807, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 17950 + }, + { + "epoch": 0.17951, + "grad_norm": 1.1017829514672806, + "learning_rate": 0.003, + "loss": 4.063, + "step": 17951 + }, + { + "epoch": 0.17952, + "grad_norm": 0.9347560947573004, + "learning_rate": 0.003, + "loss": 4.04, + "step": 17952 + }, + { + "epoch": 0.17953, + "grad_norm": 0.9440296701440463, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 17953 + }, + { + "epoch": 0.17954, + "grad_norm": 0.8903371073408782, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 17954 + }, + { + "epoch": 0.17955, + "grad_norm": 0.8666115680666635, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 17955 + }, + { + "epoch": 0.17956, + "grad_norm": 0.8914096643592457, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 17956 + }, + { + "epoch": 0.17957, + "grad_norm": 1.0763191811784587, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 17957 + }, + { + "epoch": 0.17958, + "grad_norm": 0.8774000849886722, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 17958 + }, + { + "epoch": 0.17959, + "grad_norm": 0.8562373317838089, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 17959 + }, + { + "epoch": 0.1796, + "grad_norm": 0.8505421766506722, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 17960 + }, + { + "epoch": 0.17961, + "grad_norm": 0.7987535692134098, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 17961 + }, + { + "epoch": 0.17962, + "grad_norm": 0.8460412070139576, + "learning_rate": 0.003, + "loss": 4.043, + "step": 17962 + }, + { + "epoch": 0.17963, + "grad_norm": 0.9366779769009153, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 17963 + }, + { + "epoch": 0.17964, + "grad_norm": 1.1279418516043072, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 17964 + }, + { + "epoch": 0.17965, + "grad_norm": 0.8691996478669544, + "learning_rate": 0.003, + "loss": 4.048, + "step": 17965 + }, + { + "epoch": 0.17966, + "grad_norm": 0.7984813154248974, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 17966 + }, + { + "epoch": 0.17967, + "grad_norm": 0.784405067697677, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 17967 + }, + { + "epoch": 0.17968, + "grad_norm": 0.7624571907998321, + "learning_rate": 0.003, + "loss": 4.056, + "step": 17968 + }, + { + "epoch": 0.17969, + "grad_norm": 0.7934868979355254, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 17969 + }, + { + "epoch": 0.1797, + "grad_norm": 0.8584516317708869, + "learning_rate": 0.003, + "loss": 4.058, + "step": 17970 + }, + { + "epoch": 0.17971, + "grad_norm": 0.8440505039962745, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 17971 + }, + { + "epoch": 0.17972, + "grad_norm": 0.7911630625598691, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 17972 + }, + { + "epoch": 0.17973, + "grad_norm": 0.7811077834896014, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 17973 + }, + { + "epoch": 0.17974, + "grad_norm": 0.8446794202047522, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 17974 + }, + { + "epoch": 0.17975, + "grad_norm": 0.8254521903148109, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 17975 + }, + { + "epoch": 0.17976, + "grad_norm": 0.728235183395874, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 17976 + }, + { + "epoch": 0.17977, + "grad_norm": 0.7225241440192826, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 17977 + }, + { + "epoch": 0.17978, + "grad_norm": 0.7427535082841806, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 17978 + }, + { + "epoch": 0.17979, + "grad_norm": 0.811799520487269, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 17979 + }, + { + "epoch": 0.1798, + "grad_norm": 0.922958204480545, + "learning_rate": 0.003, + "loss": 4.045, + "step": 17980 + }, + { + "epoch": 0.17981, + "grad_norm": 0.9743395951623662, + "learning_rate": 0.003, + "loss": 3.9885, + "step": 17981 + }, + { + "epoch": 0.17982, + "grad_norm": 1.373180609655959, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 17982 + }, + { + "epoch": 0.17983, + "grad_norm": 0.8168275487681272, + "learning_rate": 0.003, + "loss": 4.065, + "step": 17983 + }, + { + "epoch": 0.17984, + "grad_norm": 0.7080180162419921, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 17984 + }, + { + "epoch": 0.17985, + "grad_norm": 0.7940605541977659, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 17985 + }, + { + "epoch": 0.17986, + "grad_norm": 0.7254951582788487, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 17986 + }, + { + "epoch": 0.17987, + "grad_norm": 0.7448635415988921, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 17987 + }, + { + "epoch": 0.17988, + "grad_norm": 0.8047736846631103, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 17988 + }, + { + "epoch": 0.17989, + "grad_norm": 0.9574550527423038, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 17989 + }, + { + "epoch": 0.1799, + "grad_norm": 1.020786271331795, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 17990 + }, + { + "epoch": 0.17991, + "grad_norm": 0.9887784017834026, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 17991 + }, + { + "epoch": 0.17992, + "grad_norm": 0.9676698946785408, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 17992 + }, + { + "epoch": 0.17993, + "grad_norm": 0.8948717629191097, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 17993 + }, + { + "epoch": 0.17994, + "grad_norm": 0.7534672583024733, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 17994 + }, + { + "epoch": 0.17995, + "grad_norm": 0.788104936502091, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 17995 + }, + { + "epoch": 0.17996, + "grad_norm": 0.8128901529972176, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 17996 + }, + { + "epoch": 0.17997, + "grad_norm": 0.7882848770742896, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 17997 + }, + { + "epoch": 0.17998, + "grad_norm": 0.850474555041141, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 17998 + }, + { + "epoch": 0.17999, + "grad_norm": 0.9586233532033333, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 17999 + }, + { + "epoch": 0.18, + "grad_norm": 1.1400668480008858, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 18000 + }, + { + "epoch": 0.18001, + "grad_norm": 1.054919436998714, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 18001 + }, + { + "epoch": 0.18002, + "grad_norm": 1.0247121761574658, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 18002 + }, + { + "epoch": 0.18003, + "grad_norm": 1.054778871943613, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 18003 + }, + { + "epoch": 0.18004, + "grad_norm": 1.0080173050674024, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 18004 + }, + { + "epoch": 0.18005, + "grad_norm": 0.9872412135627638, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 18005 + }, + { + "epoch": 0.18006, + "grad_norm": 0.8924778948565202, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 18006 + }, + { + "epoch": 0.18007, + "grad_norm": 0.913301667115082, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 18007 + }, + { + "epoch": 0.18008, + "grad_norm": 1.0397859611221514, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 18008 + }, + { + "epoch": 0.18009, + "grad_norm": 1.1887474112560947, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 18009 + }, + { + "epoch": 0.1801, + "grad_norm": 0.8600276505610895, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 18010 + }, + { + "epoch": 0.18011, + "grad_norm": 0.8158139335514721, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 18011 + }, + { + "epoch": 0.18012, + "grad_norm": 0.8305582817254908, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 18012 + }, + { + "epoch": 0.18013, + "grad_norm": 0.8390759420132285, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 18013 + }, + { + "epoch": 0.18014, + "grad_norm": 0.8586288442501921, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 18014 + }, + { + "epoch": 0.18015, + "grad_norm": 0.8530469028643262, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 18015 + }, + { + "epoch": 0.18016, + "grad_norm": 0.9308114254337448, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 18016 + }, + { + "epoch": 0.18017, + "grad_norm": 0.88784790681411, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 18017 + }, + { + "epoch": 0.18018, + "grad_norm": 0.8238910140539516, + "learning_rate": 0.003, + "loss": 4.059, + "step": 18018 + }, + { + "epoch": 0.18019, + "grad_norm": 0.9016513070880283, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 18019 + }, + { + "epoch": 0.1802, + "grad_norm": 1.0920983725629796, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 18020 + }, + { + "epoch": 0.18021, + "grad_norm": 0.8839509402377498, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 18021 + }, + { + "epoch": 0.18022, + "grad_norm": 0.8766498380550272, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 18022 + }, + { + "epoch": 0.18023, + "grad_norm": 0.8968238559356412, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 18023 + }, + { + "epoch": 0.18024, + "grad_norm": 0.9562949134087456, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 18024 + }, + { + "epoch": 0.18025, + "grad_norm": 1.0678622849040766, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 18025 + }, + { + "epoch": 0.18026, + "grad_norm": 0.9914654337613729, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 18026 + }, + { + "epoch": 0.18027, + "grad_norm": 1.0105618268675878, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 18027 + }, + { + "epoch": 0.18028, + "grad_norm": 0.8558677540865376, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 18028 + }, + { + "epoch": 0.18029, + "grad_norm": 0.7216040695209736, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 18029 + }, + { + "epoch": 0.1803, + "grad_norm": 0.7175008770053735, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 18030 + }, + { + "epoch": 0.18031, + "grad_norm": 0.823598678229924, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 18031 + }, + { + "epoch": 0.18032, + "grad_norm": 1.0033425238366533, + "learning_rate": 0.003, + "loss": 4.1353, + "step": 18032 + }, + { + "epoch": 0.18033, + "grad_norm": 1.1497851437371438, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 18033 + }, + { + "epoch": 0.18034, + "grad_norm": 1.0287070729647791, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 18034 + }, + { + "epoch": 0.18035, + "grad_norm": 1.145467793290977, + "learning_rate": 0.003, + "loss": 4.075, + "step": 18035 + }, + { + "epoch": 0.18036, + "grad_norm": 0.9283426059463389, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 18036 + }, + { + "epoch": 0.18037, + "grad_norm": 0.7992388200430705, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 18037 + }, + { + "epoch": 0.18038, + "grad_norm": 0.8191202045319761, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 18038 + }, + { + "epoch": 0.18039, + "grad_norm": 0.886227146354526, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 18039 + }, + { + "epoch": 0.1804, + "grad_norm": 0.8643583852276007, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 18040 + }, + { + "epoch": 0.18041, + "grad_norm": 0.8838099635267671, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 18041 + }, + { + "epoch": 0.18042, + "grad_norm": 0.9005069709981848, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 18042 + }, + { + "epoch": 0.18043, + "grad_norm": 0.9236549671345337, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 18043 + }, + { + "epoch": 0.18044, + "grad_norm": 0.9911425335560223, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 18044 + }, + { + "epoch": 0.18045, + "grad_norm": 1.021211340428821, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 18045 + }, + { + "epoch": 0.18046, + "grad_norm": 1.0008010184678846, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 18046 + }, + { + "epoch": 0.18047, + "grad_norm": 0.9706143140605511, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 18047 + }, + { + "epoch": 0.18048, + "grad_norm": 0.8050953450731502, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 18048 + }, + { + "epoch": 0.18049, + "grad_norm": 0.7168957116698095, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 18049 + }, + { + "epoch": 0.1805, + "grad_norm": 0.763730355417067, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 18050 + }, + { + "epoch": 0.18051, + "grad_norm": 0.8392387215658648, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 18051 + }, + { + "epoch": 0.18052, + "grad_norm": 0.8756320277128591, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 18052 + }, + { + "epoch": 0.18053, + "grad_norm": 0.9882093781293488, + "learning_rate": 0.003, + "loss": 4.063, + "step": 18053 + }, + { + "epoch": 0.18054, + "grad_norm": 1.147024489035133, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 18054 + }, + { + "epoch": 0.18055, + "grad_norm": 0.9365598602714547, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 18055 + }, + { + "epoch": 0.18056, + "grad_norm": 1.0658319735069848, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 18056 + }, + { + "epoch": 0.18057, + "grad_norm": 1.2052768115217336, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 18057 + }, + { + "epoch": 0.18058, + "grad_norm": 0.8583027638126337, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 18058 + }, + { + "epoch": 0.18059, + "grad_norm": 0.7610489523254532, + "learning_rate": 0.003, + "loss": 4.05, + "step": 18059 + }, + { + "epoch": 0.1806, + "grad_norm": 0.6985702685875976, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 18060 + }, + { + "epoch": 0.18061, + "grad_norm": 0.5905272923550221, + "learning_rate": 0.003, + "loss": 4.065, + "step": 18061 + }, + { + "epoch": 0.18062, + "grad_norm": 0.6693835892537288, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 18062 + }, + { + "epoch": 0.18063, + "grad_norm": 0.642860341607339, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 18063 + }, + { + "epoch": 0.18064, + "grad_norm": 0.644897143297268, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 18064 + }, + { + "epoch": 0.18065, + "grad_norm": 0.6998940543634489, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 18065 + }, + { + "epoch": 0.18066, + "grad_norm": 0.7876390615700182, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 18066 + }, + { + "epoch": 0.18067, + "grad_norm": 0.8507104078395324, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 18067 + }, + { + "epoch": 0.18068, + "grad_norm": 0.7868838319668721, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 18068 + }, + { + "epoch": 0.18069, + "grad_norm": 0.9103956263232936, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 18069 + }, + { + "epoch": 0.1807, + "grad_norm": 1.216721313552926, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 18070 + }, + { + "epoch": 0.18071, + "grad_norm": 0.8443612196545752, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 18071 + }, + { + "epoch": 0.18072, + "grad_norm": 0.7473558386864037, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 18072 + }, + { + "epoch": 0.18073, + "grad_norm": 0.7736004856991765, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 18073 + }, + { + "epoch": 0.18074, + "grad_norm": 0.7245788317208545, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 18074 + }, + { + "epoch": 0.18075, + "grad_norm": 0.77661643464328, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 18075 + }, + { + "epoch": 0.18076, + "grad_norm": 1.0284922953510736, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 18076 + }, + { + "epoch": 0.18077, + "grad_norm": 1.14881967732802, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 18077 + }, + { + "epoch": 0.18078, + "grad_norm": 0.9006986801785335, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 18078 + }, + { + "epoch": 0.18079, + "grad_norm": 0.9641920751676571, + "learning_rate": 0.003, + "loss": 4.056, + "step": 18079 + }, + { + "epoch": 0.1808, + "grad_norm": 1.1083547323529184, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 18080 + }, + { + "epoch": 0.18081, + "grad_norm": 0.8394517315269832, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 18081 + }, + { + "epoch": 0.18082, + "grad_norm": 0.7896556419721917, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 18082 + }, + { + "epoch": 0.18083, + "grad_norm": 0.7437492272834548, + "learning_rate": 0.003, + "loss": 4.051, + "step": 18083 + }, + { + "epoch": 0.18084, + "grad_norm": 0.7350499511250812, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 18084 + }, + { + "epoch": 0.18085, + "grad_norm": 0.7586160703975404, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 18085 + }, + { + "epoch": 0.18086, + "grad_norm": 0.8605106520318582, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 18086 + }, + { + "epoch": 0.18087, + "grad_norm": 1.0523219277194693, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 18087 + }, + { + "epoch": 0.18088, + "grad_norm": 0.9540598632872929, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 18088 + }, + { + "epoch": 0.18089, + "grad_norm": 0.954635444988524, + "learning_rate": 0.003, + "loss": 4.037, + "step": 18089 + }, + { + "epoch": 0.1809, + "grad_norm": 1.0242917218797458, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 18090 + }, + { + "epoch": 0.18091, + "grad_norm": 1.038614160368774, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 18091 + }, + { + "epoch": 0.18092, + "grad_norm": 0.857783444613349, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 18092 + }, + { + "epoch": 0.18093, + "grad_norm": 0.7140591390077858, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 18093 + }, + { + "epoch": 0.18094, + "grad_norm": 0.7482695580063535, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 18094 + }, + { + "epoch": 0.18095, + "grad_norm": 0.7385178452807232, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 18095 + }, + { + "epoch": 0.18096, + "grad_norm": 0.8392621298958493, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 18096 + }, + { + "epoch": 0.18097, + "grad_norm": 0.8916954202110989, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 18097 + }, + { + "epoch": 0.18098, + "grad_norm": 1.0075924604629982, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 18098 + }, + { + "epoch": 0.18099, + "grad_norm": 0.9911745971497355, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 18099 + }, + { + "epoch": 0.181, + "grad_norm": 0.9567441446592772, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 18100 + }, + { + "epoch": 0.18101, + "grad_norm": 0.9617819746104196, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 18101 + }, + { + "epoch": 0.18102, + "grad_norm": 1.010570390083101, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 18102 + }, + { + "epoch": 0.18103, + "grad_norm": 0.9373306937056151, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 18103 + }, + { + "epoch": 0.18104, + "grad_norm": 0.9270389789703667, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 18104 + }, + { + "epoch": 0.18105, + "grad_norm": 1.08425137493349, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 18105 + }, + { + "epoch": 0.18106, + "grad_norm": 1.195912316352079, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 18106 + }, + { + "epoch": 0.18107, + "grad_norm": 1.004636464485103, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 18107 + }, + { + "epoch": 0.18108, + "grad_norm": 0.9940175610961242, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 18108 + }, + { + "epoch": 0.18109, + "grad_norm": 0.9927972978176638, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 18109 + }, + { + "epoch": 0.1811, + "grad_norm": 1.1097166512494476, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 18110 + }, + { + "epoch": 0.18111, + "grad_norm": 0.8813473769855856, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 18111 + }, + { + "epoch": 0.18112, + "grad_norm": 0.8964082186183482, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 18112 + }, + { + "epoch": 0.18113, + "grad_norm": 0.950768700593564, + "learning_rate": 0.003, + "loss": 4.106, + "step": 18113 + }, + { + "epoch": 0.18114, + "grad_norm": 0.9586280817607733, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 18114 + }, + { + "epoch": 0.18115, + "grad_norm": 0.9933290778334918, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 18115 + }, + { + "epoch": 0.18116, + "grad_norm": 1.0564887509087377, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 18116 + }, + { + "epoch": 0.18117, + "grad_norm": 1.0569225404214955, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 18117 + }, + { + "epoch": 0.18118, + "grad_norm": 1.0252559312834126, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 18118 + }, + { + "epoch": 0.18119, + "grad_norm": 0.9721326520180619, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 18119 + }, + { + "epoch": 0.1812, + "grad_norm": 0.8857108000225521, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 18120 + }, + { + "epoch": 0.18121, + "grad_norm": 0.8609578637849452, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 18121 + }, + { + "epoch": 0.18122, + "grad_norm": 0.9725015666055711, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 18122 + }, + { + "epoch": 0.18123, + "grad_norm": 0.9505304188010881, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 18123 + }, + { + "epoch": 0.18124, + "grad_norm": 0.8048063134000035, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 18124 + }, + { + "epoch": 0.18125, + "grad_norm": 0.7864391093644963, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 18125 + }, + { + "epoch": 0.18126, + "grad_norm": 0.8107160198696282, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 18126 + }, + { + "epoch": 0.18127, + "grad_norm": 0.805285737641391, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 18127 + }, + { + "epoch": 0.18128, + "grad_norm": 0.8503046943944598, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 18128 + }, + { + "epoch": 0.18129, + "grad_norm": 0.9339308416554004, + "learning_rate": 0.003, + "loss": 4.046, + "step": 18129 + }, + { + "epoch": 0.1813, + "grad_norm": 1.0167343887423093, + "learning_rate": 0.003, + "loss": 4.063, + "step": 18130 + }, + { + "epoch": 0.18131, + "grad_norm": 1.083885297834378, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 18131 + }, + { + "epoch": 0.18132, + "grad_norm": 0.7863781208744626, + "learning_rate": 0.003, + "loss": 4.075, + "step": 18132 + }, + { + "epoch": 0.18133, + "grad_norm": 0.6305209627716251, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 18133 + }, + { + "epoch": 0.18134, + "grad_norm": 0.6383779621138967, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 18134 + }, + { + "epoch": 0.18135, + "grad_norm": 0.6245947534556049, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 18135 + }, + { + "epoch": 0.18136, + "grad_norm": 0.5333880341479986, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 18136 + }, + { + "epoch": 0.18137, + "grad_norm": 0.5480307472237741, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 18137 + }, + { + "epoch": 0.18138, + "grad_norm": 0.5766572580782144, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 18138 + }, + { + "epoch": 0.18139, + "grad_norm": 0.6721878089281043, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 18139 + }, + { + "epoch": 0.1814, + "grad_norm": 0.8138944507771368, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 18140 + }, + { + "epoch": 0.18141, + "grad_norm": 0.9633570397696866, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 18141 + }, + { + "epoch": 0.18142, + "grad_norm": 1.0664658924456716, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 18142 + }, + { + "epoch": 0.18143, + "grad_norm": 1.1136573459458181, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 18143 + }, + { + "epoch": 0.18144, + "grad_norm": 0.8628506049356274, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 18144 + }, + { + "epoch": 0.18145, + "grad_norm": 0.7642134080266269, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 18145 + }, + { + "epoch": 0.18146, + "grad_norm": 0.8484411142804493, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 18146 + }, + { + "epoch": 0.18147, + "grad_norm": 0.8215187819624383, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 18147 + }, + { + "epoch": 0.18148, + "grad_norm": 0.7898606748433694, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 18148 + }, + { + "epoch": 0.18149, + "grad_norm": 0.9865450042214753, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 18149 + }, + { + "epoch": 0.1815, + "grad_norm": 1.2419376894105545, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 18150 + }, + { + "epoch": 0.18151, + "grad_norm": 0.9270812697724077, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 18151 + }, + { + "epoch": 0.18152, + "grad_norm": 0.9479380298289326, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 18152 + }, + { + "epoch": 0.18153, + "grad_norm": 1.0005480471613615, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 18153 + }, + { + "epoch": 0.18154, + "grad_norm": 1.016867069998641, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 18154 + }, + { + "epoch": 0.18155, + "grad_norm": 0.8489247722993403, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 18155 + }, + { + "epoch": 0.18156, + "grad_norm": 0.8186962274083154, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 18156 + }, + { + "epoch": 0.18157, + "grad_norm": 0.7540143087358937, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 18157 + }, + { + "epoch": 0.18158, + "grad_norm": 0.7419636605484827, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 18158 + }, + { + "epoch": 0.18159, + "grad_norm": 0.802284595751655, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 18159 + }, + { + "epoch": 0.1816, + "grad_norm": 0.9385428012280234, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 18160 + }, + { + "epoch": 0.18161, + "grad_norm": 0.9717302322199527, + "learning_rate": 0.003, + "loss": 4.07, + "step": 18161 + }, + { + "epoch": 0.18162, + "grad_norm": 0.8754633514842903, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 18162 + }, + { + "epoch": 0.18163, + "grad_norm": 0.8396510209304644, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 18163 + }, + { + "epoch": 0.18164, + "grad_norm": 0.955555668213817, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 18164 + }, + { + "epoch": 0.18165, + "grad_norm": 1.137697207615346, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 18165 + }, + { + "epoch": 0.18166, + "grad_norm": 1.0881620225161932, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 18166 + }, + { + "epoch": 0.18167, + "grad_norm": 0.9485067426620597, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 18167 + }, + { + "epoch": 0.18168, + "grad_norm": 0.9243580245057562, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 18168 + }, + { + "epoch": 0.18169, + "grad_norm": 1.0203908542125293, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 18169 + }, + { + "epoch": 0.1817, + "grad_norm": 1.09077537303057, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 18170 + }, + { + "epoch": 0.18171, + "grad_norm": 0.9325823985592244, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 18171 + }, + { + "epoch": 0.18172, + "grad_norm": 0.915691801723223, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 18172 + }, + { + "epoch": 0.18173, + "grad_norm": 1.0059008108083742, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 18173 + }, + { + "epoch": 0.18174, + "grad_norm": 1.0551749379975364, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 18174 + }, + { + "epoch": 0.18175, + "grad_norm": 0.8984459911393465, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 18175 + }, + { + "epoch": 0.18176, + "grad_norm": 0.7819216793924464, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 18176 + }, + { + "epoch": 0.18177, + "grad_norm": 0.7210490214467581, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 18177 + }, + { + "epoch": 0.18178, + "grad_norm": 0.7012450839347708, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 18178 + }, + { + "epoch": 0.18179, + "grad_norm": 0.7692705758043559, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 18179 + }, + { + "epoch": 0.1818, + "grad_norm": 0.7210100101828922, + "learning_rate": 0.003, + "loss": 4.06, + "step": 18180 + }, + { + "epoch": 0.18181, + "grad_norm": 0.7500848601154694, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 18181 + }, + { + "epoch": 0.18182, + "grad_norm": 0.7321087967261691, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 18182 + }, + { + "epoch": 0.18183, + "grad_norm": 0.8023509999946052, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 18183 + }, + { + "epoch": 0.18184, + "grad_norm": 1.0483452254588141, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 18184 + }, + { + "epoch": 0.18185, + "grad_norm": 1.1355790294194081, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 18185 + }, + { + "epoch": 0.18186, + "grad_norm": 0.6902671569504694, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 18186 + }, + { + "epoch": 0.18187, + "grad_norm": 0.6531811095563804, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 18187 + }, + { + "epoch": 0.18188, + "grad_norm": 0.7234975208818184, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 18188 + }, + { + "epoch": 0.18189, + "grad_norm": 0.7301938424591938, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 18189 + }, + { + "epoch": 0.1819, + "grad_norm": 0.7773847873457516, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 18190 + }, + { + "epoch": 0.18191, + "grad_norm": 0.6645527794024358, + "learning_rate": 0.003, + "loss": 4.049, + "step": 18191 + }, + { + "epoch": 0.18192, + "grad_norm": 0.5967643259583706, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 18192 + }, + { + "epoch": 0.18193, + "grad_norm": 0.6782147612719742, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 18193 + }, + { + "epoch": 0.18194, + "grad_norm": 0.7955734597934974, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 18194 + }, + { + "epoch": 0.18195, + "grad_norm": 0.9581035879244924, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 18195 + }, + { + "epoch": 0.18196, + "grad_norm": 0.9299283102512945, + "learning_rate": 0.003, + "loss": 4.048, + "step": 18196 + }, + { + "epoch": 0.18197, + "grad_norm": 0.9942387648297777, + "learning_rate": 0.003, + "loss": 4.058, + "step": 18197 + }, + { + "epoch": 0.18198, + "grad_norm": 1.354846241094494, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 18198 + }, + { + "epoch": 0.18199, + "grad_norm": 0.8961845114651893, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 18199 + }, + { + "epoch": 0.182, + "grad_norm": 0.9161895535538193, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 18200 + }, + { + "epoch": 0.18201, + "grad_norm": 0.9347028076466997, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 18201 + }, + { + "epoch": 0.18202, + "grad_norm": 1.1339752903572067, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 18202 + }, + { + "epoch": 0.18203, + "grad_norm": 0.9935774237436475, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 18203 + }, + { + "epoch": 0.18204, + "grad_norm": 1.119459686939325, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 18204 + }, + { + "epoch": 0.18205, + "grad_norm": 0.9604229229376183, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 18205 + }, + { + "epoch": 0.18206, + "grad_norm": 1.0787143652624855, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 18206 + }, + { + "epoch": 0.18207, + "grad_norm": 0.9989436306033681, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 18207 + }, + { + "epoch": 0.18208, + "grad_norm": 0.9745983499062763, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 18208 + }, + { + "epoch": 0.18209, + "grad_norm": 0.8481239034032982, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 18209 + }, + { + "epoch": 0.1821, + "grad_norm": 0.7248993858698688, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 18210 + }, + { + "epoch": 0.18211, + "grad_norm": 0.6908269403481287, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 18211 + }, + { + "epoch": 0.18212, + "grad_norm": 0.6959265942355016, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 18212 + }, + { + "epoch": 0.18213, + "grad_norm": 0.6945147820887164, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 18213 + }, + { + "epoch": 0.18214, + "grad_norm": 0.6266237949266471, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 18214 + }, + { + "epoch": 0.18215, + "grad_norm": 0.6170896985720568, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 18215 + }, + { + "epoch": 0.18216, + "grad_norm": 0.6368359625575057, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 18216 + }, + { + "epoch": 0.18217, + "grad_norm": 0.6702022147134921, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 18217 + }, + { + "epoch": 0.18218, + "grad_norm": 0.8823613952334576, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 18218 + }, + { + "epoch": 0.18219, + "grad_norm": 1.4070313072252556, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 18219 + }, + { + "epoch": 0.1822, + "grad_norm": 0.7552854162567318, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 18220 + }, + { + "epoch": 0.18221, + "grad_norm": 0.6885545263331007, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 18221 + }, + { + "epoch": 0.18222, + "grad_norm": 0.6904079633572755, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 18222 + }, + { + "epoch": 0.18223, + "grad_norm": 0.7200929084130898, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 18223 + }, + { + "epoch": 0.18224, + "grad_norm": 0.7455347241748624, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 18224 + }, + { + "epoch": 0.18225, + "grad_norm": 0.8569416229175616, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 18225 + }, + { + "epoch": 0.18226, + "grad_norm": 0.9831236584861761, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 18226 + }, + { + "epoch": 0.18227, + "grad_norm": 1.2538086527772438, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 18227 + }, + { + "epoch": 0.18228, + "grad_norm": 0.8882797801107741, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 18228 + }, + { + "epoch": 0.18229, + "grad_norm": 0.8519735158492483, + "learning_rate": 0.003, + "loss": 4.042, + "step": 18229 + }, + { + "epoch": 0.1823, + "grad_norm": 0.8445283227191879, + "learning_rate": 0.003, + "loss": 4.075, + "step": 18230 + }, + { + "epoch": 0.18231, + "grad_norm": 0.9797837087003988, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 18231 + }, + { + "epoch": 0.18232, + "grad_norm": 1.1221256809356586, + "learning_rate": 0.003, + "loss": 4.069, + "step": 18232 + }, + { + "epoch": 0.18233, + "grad_norm": 0.8903268120362483, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 18233 + }, + { + "epoch": 0.18234, + "grad_norm": 1.0158494552810846, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 18234 + }, + { + "epoch": 0.18235, + "grad_norm": 1.1066567935006122, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 18235 + }, + { + "epoch": 0.18236, + "grad_norm": 0.6874891280868245, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 18236 + }, + { + "epoch": 0.18237, + "grad_norm": 0.6705247786577884, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 18237 + }, + { + "epoch": 0.18238, + "grad_norm": 0.7732181362741339, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 18238 + }, + { + "epoch": 0.18239, + "grad_norm": 0.7754611545539052, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 18239 + }, + { + "epoch": 0.1824, + "grad_norm": 0.9344975763258625, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 18240 + }, + { + "epoch": 0.18241, + "grad_norm": 1.2832078448609356, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 18241 + }, + { + "epoch": 0.18242, + "grad_norm": 0.9556987296668182, + "learning_rate": 0.003, + "loss": 4.023, + "step": 18242 + }, + { + "epoch": 0.18243, + "grad_norm": 0.9043533022376008, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 18243 + }, + { + "epoch": 0.18244, + "grad_norm": 0.9146811059154532, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 18244 + }, + { + "epoch": 0.18245, + "grad_norm": 1.1334886768689365, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 18245 + }, + { + "epoch": 0.18246, + "grad_norm": 0.9872644352464646, + "learning_rate": 0.003, + "loss": 4.053, + "step": 18246 + }, + { + "epoch": 0.18247, + "grad_norm": 0.9746216042009118, + "learning_rate": 0.003, + "loss": 4.065, + "step": 18247 + }, + { + "epoch": 0.18248, + "grad_norm": 0.873982634022835, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 18248 + }, + { + "epoch": 0.18249, + "grad_norm": 0.7966570786614937, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 18249 + }, + { + "epoch": 0.1825, + "grad_norm": 0.8541605241880963, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 18250 + }, + { + "epoch": 0.18251, + "grad_norm": 0.8219705222647181, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 18251 + }, + { + "epoch": 0.18252, + "grad_norm": 0.8552633058511083, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 18252 + }, + { + "epoch": 0.18253, + "grad_norm": 0.8627514450069399, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 18253 + }, + { + "epoch": 0.18254, + "grad_norm": 0.8435662291486822, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 18254 + }, + { + "epoch": 0.18255, + "grad_norm": 0.8874599992740497, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 18255 + }, + { + "epoch": 0.18256, + "grad_norm": 1.124151888645897, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 18256 + }, + { + "epoch": 0.18257, + "grad_norm": 0.9881054341075414, + "learning_rate": 0.003, + "loss": 4.069, + "step": 18257 + }, + { + "epoch": 0.18258, + "grad_norm": 0.9111781193843494, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 18258 + }, + { + "epoch": 0.18259, + "grad_norm": 0.9461466892318765, + "learning_rate": 0.003, + "loss": 4.065, + "step": 18259 + }, + { + "epoch": 0.1826, + "grad_norm": 0.9910200896152757, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 18260 + }, + { + "epoch": 0.18261, + "grad_norm": 1.0945011675228884, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 18261 + }, + { + "epoch": 0.18262, + "grad_norm": 0.9424357588602961, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 18262 + }, + { + "epoch": 0.18263, + "grad_norm": 0.8973101958045837, + "learning_rate": 0.003, + "loss": 4.069, + "step": 18263 + }, + { + "epoch": 0.18264, + "grad_norm": 0.8859311667384303, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 18264 + }, + { + "epoch": 0.18265, + "grad_norm": 0.8538222882398226, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 18265 + }, + { + "epoch": 0.18266, + "grad_norm": 0.7469743685221801, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 18266 + }, + { + "epoch": 0.18267, + "grad_norm": 0.7634378210328012, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 18267 + }, + { + "epoch": 0.18268, + "grad_norm": 0.8821616565620666, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 18268 + }, + { + "epoch": 0.18269, + "grad_norm": 0.9031687267940208, + "learning_rate": 0.003, + "loss": 4.074, + "step": 18269 + }, + { + "epoch": 0.1827, + "grad_norm": 0.8377298491836317, + "learning_rate": 0.003, + "loss": 4.054, + "step": 18270 + }, + { + "epoch": 0.18271, + "grad_norm": 0.9628602826310017, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 18271 + }, + { + "epoch": 0.18272, + "grad_norm": 0.848731983485719, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 18272 + }, + { + "epoch": 0.18273, + "grad_norm": 0.78200864179445, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 18273 + }, + { + "epoch": 0.18274, + "grad_norm": 0.8714591997469179, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 18274 + }, + { + "epoch": 0.18275, + "grad_norm": 0.9714147257975454, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 18275 + }, + { + "epoch": 0.18276, + "grad_norm": 1.1434623688571923, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 18276 + }, + { + "epoch": 0.18277, + "grad_norm": 1.0768484423778402, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 18277 + }, + { + "epoch": 0.18278, + "grad_norm": 1.023875330268496, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 18278 + }, + { + "epoch": 0.18279, + "grad_norm": 1.0239789661246292, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 18279 + }, + { + "epoch": 0.1828, + "grad_norm": 0.9856963577577317, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 18280 + }, + { + "epoch": 0.18281, + "grad_norm": 0.8789880468350629, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 18281 + }, + { + "epoch": 0.18282, + "grad_norm": 0.8511356093129471, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 18282 + }, + { + "epoch": 0.18283, + "grad_norm": 0.920479915384502, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 18283 + }, + { + "epoch": 0.18284, + "grad_norm": 0.8594703754993123, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 18284 + }, + { + "epoch": 0.18285, + "grad_norm": 0.7253504130445864, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 18285 + }, + { + "epoch": 0.18286, + "grad_norm": 0.8583690051592623, + "learning_rate": 0.003, + "loss": 4.062, + "step": 18286 + }, + { + "epoch": 0.18287, + "grad_norm": 0.7911440901548231, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 18287 + }, + { + "epoch": 0.18288, + "grad_norm": 0.776923762306454, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 18288 + }, + { + "epoch": 0.18289, + "grad_norm": 0.7842037109411842, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 18289 + }, + { + "epoch": 0.1829, + "grad_norm": 0.7133961788360071, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 18290 + }, + { + "epoch": 0.18291, + "grad_norm": 0.8516798753726228, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 18291 + }, + { + "epoch": 0.18292, + "grad_norm": 0.996869566368745, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 18292 + }, + { + "epoch": 0.18293, + "grad_norm": 1.2170779467026525, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 18293 + }, + { + "epoch": 0.18294, + "grad_norm": 0.9956472849566619, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 18294 + }, + { + "epoch": 0.18295, + "grad_norm": 1.0407446679918768, + "learning_rate": 0.003, + "loss": 4.063, + "step": 18295 + }, + { + "epoch": 0.18296, + "grad_norm": 1.0606650933847244, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 18296 + }, + { + "epoch": 0.18297, + "grad_norm": 0.8768720508475318, + "learning_rate": 0.003, + "loss": 4.081, + "step": 18297 + }, + { + "epoch": 0.18298, + "grad_norm": 0.866777662801284, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 18298 + }, + { + "epoch": 0.18299, + "grad_norm": 0.9263658840199724, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 18299 + }, + { + "epoch": 0.183, + "grad_norm": 1.0691709281154491, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 18300 + }, + { + "epoch": 0.18301, + "grad_norm": 1.2485516906991745, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 18301 + }, + { + "epoch": 0.18302, + "grad_norm": 0.9049009757806349, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 18302 + }, + { + "epoch": 0.18303, + "grad_norm": 0.803290980741296, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 18303 + }, + { + "epoch": 0.18304, + "grad_norm": 0.8004407740269954, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 18304 + }, + { + "epoch": 0.18305, + "grad_norm": 0.8689812640157072, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 18305 + }, + { + "epoch": 0.18306, + "grad_norm": 0.8692417606954554, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 18306 + }, + { + "epoch": 0.18307, + "grad_norm": 0.9789031146923883, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 18307 + }, + { + "epoch": 0.18308, + "grad_norm": 1.1844961302853967, + "learning_rate": 0.003, + "loss": 4.056, + "step": 18308 + }, + { + "epoch": 0.18309, + "grad_norm": 0.9856975048478858, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 18309 + }, + { + "epoch": 0.1831, + "grad_norm": 0.9390258538029391, + "learning_rate": 0.003, + "loss": 4.071, + "step": 18310 + }, + { + "epoch": 0.18311, + "grad_norm": 0.8731181926448102, + "learning_rate": 0.003, + "loss": 4.053, + "step": 18311 + }, + { + "epoch": 0.18312, + "grad_norm": 0.8153171222244627, + "learning_rate": 0.003, + "loss": 4.08, + "step": 18312 + }, + { + "epoch": 0.18313, + "grad_norm": 0.7563976129452407, + "learning_rate": 0.003, + "loss": 3.988, + "step": 18313 + }, + { + "epoch": 0.18314, + "grad_norm": 0.7758783584421425, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 18314 + }, + { + "epoch": 0.18315, + "grad_norm": 0.6732592622665557, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 18315 + }, + { + "epoch": 0.18316, + "grad_norm": 0.7433584654616154, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 18316 + }, + { + "epoch": 0.18317, + "grad_norm": 0.8370717529734161, + "learning_rate": 0.003, + "loss": 4.07, + "step": 18317 + }, + { + "epoch": 0.18318, + "grad_norm": 0.9934388495146806, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 18318 + }, + { + "epoch": 0.18319, + "grad_norm": 1.238996126516156, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 18319 + }, + { + "epoch": 0.1832, + "grad_norm": 0.6814958351543677, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 18320 + }, + { + "epoch": 0.18321, + "grad_norm": 0.6738831987359615, + "learning_rate": 0.003, + "loss": 4.116, + "step": 18321 + }, + { + "epoch": 0.18322, + "grad_norm": 0.7257307701617576, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 18322 + }, + { + "epoch": 0.18323, + "grad_norm": 0.7858957715058614, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 18323 + }, + { + "epoch": 0.18324, + "grad_norm": 0.90781181482303, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 18324 + }, + { + "epoch": 0.18325, + "grad_norm": 0.9381424590321324, + "learning_rate": 0.003, + "loss": 4.053, + "step": 18325 + }, + { + "epoch": 0.18326, + "grad_norm": 1.0111861670278783, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 18326 + }, + { + "epoch": 0.18327, + "grad_norm": 1.0206278346312894, + "learning_rate": 0.003, + "loss": 4.057, + "step": 18327 + }, + { + "epoch": 0.18328, + "grad_norm": 0.8937175824766302, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 18328 + }, + { + "epoch": 0.18329, + "grad_norm": 0.7956703126603709, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 18329 + }, + { + "epoch": 0.1833, + "grad_norm": 0.6609151533793772, + "learning_rate": 0.003, + "loss": 4.089, + "step": 18330 + }, + { + "epoch": 0.18331, + "grad_norm": 0.8023424956614408, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 18331 + }, + { + "epoch": 0.18332, + "grad_norm": 0.9444500003668816, + "learning_rate": 0.003, + "loss": 4.045, + "step": 18332 + }, + { + "epoch": 0.18333, + "grad_norm": 1.1232820292954713, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 18333 + }, + { + "epoch": 0.18334, + "grad_norm": 0.925771832016891, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 18334 + }, + { + "epoch": 0.18335, + "grad_norm": 0.90350674124472, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 18335 + }, + { + "epoch": 0.18336, + "grad_norm": 0.8962820985223595, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 18336 + }, + { + "epoch": 0.18337, + "grad_norm": 0.8742216244400192, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 18337 + }, + { + "epoch": 0.18338, + "grad_norm": 0.9489693468394845, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 18338 + }, + { + "epoch": 0.18339, + "grad_norm": 1.1047154507497938, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 18339 + }, + { + "epoch": 0.1834, + "grad_norm": 0.9738272548357211, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 18340 + }, + { + "epoch": 0.18341, + "grad_norm": 0.9289877084843211, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 18341 + }, + { + "epoch": 0.18342, + "grad_norm": 1.0034615603372803, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 18342 + }, + { + "epoch": 0.18343, + "grad_norm": 1.0869913939659064, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 18343 + }, + { + "epoch": 0.18344, + "grad_norm": 1.0326968476863894, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 18344 + }, + { + "epoch": 0.18345, + "grad_norm": 0.9365309959175494, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 18345 + }, + { + "epoch": 0.18346, + "grad_norm": 0.8441916266736671, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 18346 + }, + { + "epoch": 0.18347, + "grad_norm": 0.8448971241023325, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 18347 + }, + { + "epoch": 0.18348, + "grad_norm": 0.844413538370678, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 18348 + }, + { + "epoch": 0.18349, + "grad_norm": 1.180133984849724, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 18349 + }, + { + "epoch": 0.1835, + "grad_norm": 0.9807553149104413, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 18350 + }, + { + "epoch": 0.18351, + "grad_norm": 0.9538475469940382, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 18351 + }, + { + "epoch": 0.18352, + "grad_norm": 0.920438568997827, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 18352 + }, + { + "epoch": 0.18353, + "grad_norm": 0.9507661433634037, + "learning_rate": 0.003, + "loss": 4.071, + "step": 18353 + }, + { + "epoch": 0.18354, + "grad_norm": 0.9419113362763344, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 18354 + }, + { + "epoch": 0.18355, + "grad_norm": 0.9079770424794629, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 18355 + }, + { + "epoch": 0.18356, + "grad_norm": 0.7753937404092256, + "learning_rate": 0.003, + "loss": 4.041, + "step": 18356 + }, + { + "epoch": 0.18357, + "grad_norm": 0.8413296240917386, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 18357 + }, + { + "epoch": 0.18358, + "grad_norm": 0.9334011147106239, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 18358 + }, + { + "epoch": 0.18359, + "grad_norm": 1.0598753744361773, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 18359 + }, + { + "epoch": 0.1836, + "grad_norm": 0.8700862978204383, + "learning_rate": 0.003, + "loss": 4.062, + "step": 18360 + }, + { + "epoch": 0.18361, + "grad_norm": 0.7775217484779283, + "learning_rate": 0.003, + "loss": 4.033, + "step": 18361 + }, + { + "epoch": 0.18362, + "grad_norm": 0.8475541836531145, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 18362 + }, + { + "epoch": 0.18363, + "grad_norm": 1.002621330099506, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 18363 + }, + { + "epoch": 0.18364, + "grad_norm": 1.0121652612178136, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 18364 + }, + { + "epoch": 0.18365, + "grad_norm": 0.896934730668961, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 18365 + }, + { + "epoch": 0.18366, + "grad_norm": 0.700411786910494, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 18366 + }, + { + "epoch": 0.18367, + "grad_norm": 0.7200053737541005, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 18367 + }, + { + "epoch": 0.18368, + "grad_norm": 0.7379024769412605, + "learning_rate": 0.003, + "loss": 4.037, + "step": 18368 + }, + { + "epoch": 0.18369, + "grad_norm": 0.7369079470654286, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 18369 + }, + { + "epoch": 0.1837, + "grad_norm": 0.675034788221465, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 18370 + }, + { + "epoch": 0.18371, + "grad_norm": 0.6761825233019262, + "learning_rate": 0.003, + "loss": 4.028, + "step": 18371 + }, + { + "epoch": 0.18372, + "grad_norm": 0.73695990545588, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 18372 + }, + { + "epoch": 0.18373, + "grad_norm": 0.8779304698659591, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 18373 + }, + { + "epoch": 0.18374, + "grad_norm": 0.9574820279518487, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 18374 + }, + { + "epoch": 0.18375, + "grad_norm": 1.0082288952455414, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 18375 + }, + { + "epoch": 0.18376, + "grad_norm": 1.0789011139422715, + "learning_rate": 0.003, + "loss": 4.079, + "step": 18376 + }, + { + "epoch": 0.18377, + "grad_norm": 0.8889600977719888, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 18377 + }, + { + "epoch": 0.18378, + "grad_norm": 0.7369730004655919, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 18378 + }, + { + "epoch": 0.18379, + "grad_norm": 0.7006050753014715, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 18379 + }, + { + "epoch": 0.1838, + "grad_norm": 0.7135330918358853, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 18380 + }, + { + "epoch": 0.18381, + "grad_norm": 0.7277103801109799, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 18381 + }, + { + "epoch": 0.18382, + "grad_norm": 0.7512859891377555, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 18382 + }, + { + "epoch": 0.18383, + "grad_norm": 0.7100037046870975, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 18383 + }, + { + "epoch": 0.18384, + "grad_norm": 0.7845754390711044, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 18384 + }, + { + "epoch": 0.18385, + "grad_norm": 0.8088905780895999, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 18385 + }, + { + "epoch": 0.18386, + "grad_norm": 0.9519916077831818, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 18386 + }, + { + "epoch": 0.18387, + "grad_norm": 1.0850979765636848, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 18387 + }, + { + "epoch": 0.18388, + "grad_norm": 1.018253926346212, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 18388 + }, + { + "epoch": 0.18389, + "grad_norm": 0.9685745457080133, + "learning_rate": 0.003, + "loss": 4.075, + "step": 18389 + }, + { + "epoch": 0.1839, + "grad_norm": 0.9997793727571137, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 18390 + }, + { + "epoch": 0.18391, + "grad_norm": 1.1096756427692092, + "learning_rate": 0.003, + "loss": 4.05, + "step": 18391 + }, + { + "epoch": 0.18392, + "grad_norm": 1.0053052369757307, + "learning_rate": 0.003, + "loss": 4.035, + "step": 18392 + }, + { + "epoch": 0.18393, + "grad_norm": 1.0907581583031685, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 18393 + }, + { + "epoch": 0.18394, + "grad_norm": 0.9485181197151356, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 18394 + }, + { + "epoch": 0.18395, + "grad_norm": 1.0125317780175331, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 18395 + }, + { + "epoch": 0.18396, + "grad_norm": 1.0589857597585408, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 18396 + }, + { + "epoch": 0.18397, + "grad_norm": 0.9428369261269266, + "learning_rate": 0.003, + "loss": 4.089, + "step": 18397 + }, + { + "epoch": 0.18398, + "grad_norm": 0.9058252684157386, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 18398 + }, + { + "epoch": 0.18399, + "grad_norm": 0.9004445411384819, + "learning_rate": 0.003, + "loss": 4.056, + "step": 18399 + }, + { + "epoch": 0.184, + "grad_norm": 0.8888019074005667, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 18400 + }, + { + "epoch": 0.18401, + "grad_norm": 0.9103513397753971, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 18401 + }, + { + "epoch": 0.18402, + "grad_norm": 1.0078726060095158, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 18402 + }, + { + "epoch": 0.18403, + "grad_norm": 1.2079609013253374, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 18403 + }, + { + "epoch": 0.18404, + "grad_norm": 1.0349328044208834, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 18404 + }, + { + "epoch": 0.18405, + "grad_norm": 1.0252836146503492, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 18405 + }, + { + "epoch": 0.18406, + "grad_norm": 1.0518917905917498, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 18406 + }, + { + "epoch": 0.18407, + "grad_norm": 0.9447576952333357, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 18407 + }, + { + "epoch": 0.18408, + "grad_norm": 0.950840379862803, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 18408 + }, + { + "epoch": 0.18409, + "grad_norm": 0.9081149147449998, + "learning_rate": 0.003, + "loss": 4.084, + "step": 18409 + }, + { + "epoch": 0.1841, + "grad_norm": 0.946756086650973, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 18410 + }, + { + "epoch": 0.18411, + "grad_norm": 1.0261992049981374, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 18411 + }, + { + "epoch": 0.18412, + "grad_norm": 1.0374419767202754, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 18412 + }, + { + "epoch": 0.18413, + "grad_norm": 0.938850828370222, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 18413 + }, + { + "epoch": 0.18414, + "grad_norm": 0.9428837509698571, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 18414 + }, + { + "epoch": 0.18415, + "grad_norm": 1.016081854555453, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 18415 + }, + { + "epoch": 0.18416, + "grad_norm": 0.932861505724795, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 18416 + }, + { + "epoch": 0.18417, + "grad_norm": 0.7944634969291409, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 18417 + }, + { + "epoch": 0.18418, + "grad_norm": 0.7436733513208756, + "learning_rate": 0.003, + "loss": 4.042, + "step": 18418 + }, + { + "epoch": 0.18419, + "grad_norm": 0.7619970817084877, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 18419 + }, + { + "epoch": 0.1842, + "grad_norm": 0.7726319078385481, + "learning_rate": 0.003, + "loss": 4.085, + "step": 18420 + }, + { + "epoch": 0.18421, + "grad_norm": 0.8303853659853697, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 18421 + }, + { + "epoch": 0.18422, + "grad_norm": 0.9984712080761585, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 18422 + }, + { + "epoch": 0.18423, + "grad_norm": 1.2033989132315623, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 18423 + }, + { + "epoch": 0.18424, + "grad_norm": 0.9742790495383061, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 18424 + }, + { + "epoch": 0.18425, + "grad_norm": 0.927499385772444, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 18425 + }, + { + "epoch": 0.18426, + "grad_norm": 0.8717863685698273, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 18426 + }, + { + "epoch": 0.18427, + "grad_norm": 0.889216761621203, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 18427 + }, + { + "epoch": 0.18428, + "grad_norm": 0.9016093072479948, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 18428 + }, + { + "epoch": 0.18429, + "grad_norm": 0.8894854111643814, + "learning_rate": 0.003, + "loss": 4.077, + "step": 18429 + }, + { + "epoch": 0.1843, + "grad_norm": 0.7999202321666762, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 18430 + }, + { + "epoch": 0.18431, + "grad_norm": 0.8938771279461828, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 18431 + }, + { + "epoch": 0.18432, + "grad_norm": 0.8099028692557037, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 18432 + }, + { + "epoch": 0.18433, + "grad_norm": 0.9289813584587582, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 18433 + }, + { + "epoch": 0.18434, + "grad_norm": 1.0143284757428352, + "learning_rate": 0.003, + "loss": 4.063, + "step": 18434 + }, + { + "epoch": 0.18435, + "grad_norm": 1.121348756495198, + "learning_rate": 0.003, + "loss": 4.071, + "step": 18435 + }, + { + "epoch": 0.18436, + "grad_norm": 0.7415758261715508, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 18436 + }, + { + "epoch": 0.18437, + "grad_norm": 0.5183029476904694, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 18437 + }, + { + "epoch": 0.18438, + "grad_norm": 0.6287001273917971, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 18438 + }, + { + "epoch": 0.18439, + "grad_norm": 0.6527913564503991, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 18439 + }, + { + "epoch": 0.1844, + "grad_norm": 0.6537165185082999, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 18440 + }, + { + "epoch": 0.18441, + "grad_norm": 0.6109688303670382, + "learning_rate": 0.003, + "loss": 4.043, + "step": 18441 + }, + { + "epoch": 0.18442, + "grad_norm": 0.5807051392264547, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 18442 + }, + { + "epoch": 0.18443, + "grad_norm": 0.51909647450184, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 18443 + }, + { + "epoch": 0.18444, + "grad_norm": 0.6161715819722443, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 18444 + }, + { + "epoch": 0.18445, + "grad_norm": 0.686010218463515, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 18445 + }, + { + "epoch": 0.18446, + "grad_norm": 0.8357332711782991, + "learning_rate": 0.003, + "loss": 4.055, + "step": 18446 + }, + { + "epoch": 0.18447, + "grad_norm": 1.061025022993599, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 18447 + }, + { + "epoch": 0.18448, + "grad_norm": 1.0284687748521408, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 18448 + }, + { + "epoch": 0.18449, + "grad_norm": 0.9394082854203095, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 18449 + }, + { + "epoch": 0.1845, + "grad_norm": 0.8781725199666855, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 18450 + }, + { + "epoch": 0.18451, + "grad_norm": 0.8807996401925396, + "learning_rate": 0.003, + "loss": 4.064, + "step": 18451 + }, + { + "epoch": 0.18452, + "grad_norm": 0.9606994874096902, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 18452 + }, + { + "epoch": 0.18453, + "grad_norm": 0.9969082774520639, + "learning_rate": 0.003, + "loss": 4.103, + "step": 18453 + }, + { + "epoch": 0.18454, + "grad_norm": 1.1239948587452864, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 18454 + }, + { + "epoch": 0.18455, + "grad_norm": 0.9447925944530146, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 18455 + }, + { + "epoch": 0.18456, + "grad_norm": 0.8569234008384664, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 18456 + }, + { + "epoch": 0.18457, + "grad_norm": 0.8476602670449437, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 18457 + }, + { + "epoch": 0.18458, + "grad_norm": 0.9647356539077508, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 18458 + }, + { + "epoch": 0.18459, + "grad_norm": 1.1341728410551524, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 18459 + }, + { + "epoch": 0.1846, + "grad_norm": 0.9436184052763803, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 18460 + }, + { + "epoch": 0.18461, + "grad_norm": 1.0886259079988436, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 18461 + }, + { + "epoch": 0.18462, + "grad_norm": 1.1137114040374576, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 18462 + }, + { + "epoch": 0.18463, + "grad_norm": 0.888863614623211, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 18463 + }, + { + "epoch": 0.18464, + "grad_norm": 0.7955306254438335, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 18464 + }, + { + "epoch": 0.18465, + "grad_norm": 0.7948147222157742, + "learning_rate": 0.003, + "loss": 4.078, + "step": 18465 + }, + { + "epoch": 0.18466, + "grad_norm": 1.0093851830990055, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 18466 + }, + { + "epoch": 0.18467, + "grad_norm": 1.2687326355556912, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 18467 + }, + { + "epoch": 0.18468, + "grad_norm": 0.7723133455360601, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 18468 + }, + { + "epoch": 0.18469, + "grad_norm": 0.6975098229030329, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 18469 + }, + { + "epoch": 0.1847, + "grad_norm": 0.7467157201584129, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 18470 + }, + { + "epoch": 0.18471, + "grad_norm": 0.707265521656276, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 18471 + }, + { + "epoch": 0.18472, + "grad_norm": 0.7223176688893698, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 18472 + }, + { + "epoch": 0.18473, + "grad_norm": 0.7798563034331775, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 18473 + }, + { + "epoch": 0.18474, + "grad_norm": 0.8053796522646952, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 18474 + }, + { + "epoch": 0.18475, + "grad_norm": 0.8282204826481101, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 18475 + }, + { + "epoch": 0.18476, + "grad_norm": 0.8738331447423594, + "learning_rate": 0.003, + "loss": 4.066, + "step": 18476 + }, + { + "epoch": 0.18477, + "grad_norm": 0.8801572279090764, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 18477 + }, + { + "epoch": 0.18478, + "grad_norm": 1.0007657078182481, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 18478 + }, + { + "epoch": 0.18479, + "grad_norm": 1.1297510234974877, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 18479 + }, + { + "epoch": 0.1848, + "grad_norm": 0.93616625647756, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 18480 + }, + { + "epoch": 0.18481, + "grad_norm": 1.1069482850270802, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 18481 + }, + { + "epoch": 0.18482, + "grad_norm": 1.0328090219519728, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 18482 + }, + { + "epoch": 0.18483, + "grad_norm": 0.8304889646346205, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 18483 + }, + { + "epoch": 0.18484, + "grad_norm": 0.8306578210325865, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 18484 + }, + { + "epoch": 0.18485, + "grad_norm": 0.9084971459826292, + "learning_rate": 0.003, + "loss": 4.067, + "step": 18485 + }, + { + "epoch": 0.18486, + "grad_norm": 0.8508793939670165, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 18486 + }, + { + "epoch": 0.18487, + "grad_norm": 0.8456528685492818, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 18487 + }, + { + "epoch": 0.18488, + "grad_norm": 0.9793854587943929, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 18488 + }, + { + "epoch": 0.18489, + "grad_norm": 1.107666868486215, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 18489 + }, + { + "epoch": 0.1849, + "grad_norm": 0.9225095750991604, + "learning_rate": 0.003, + "loss": 4.088, + "step": 18490 + }, + { + "epoch": 0.18491, + "grad_norm": 0.8448100396851996, + "learning_rate": 0.003, + "loss": 4.066, + "step": 18491 + }, + { + "epoch": 0.18492, + "grad_norm": 0.8756577943950237, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 18492 + }, + { + "epoch": 0.18493, + "grad_norm": 0.921153383274438, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 18493 + }, + { + "epoch": 0.18494, + "grad_norm": 0.840848008704909, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 18494 + }, + { + "epoch": 0.18495, + "grad_norm": 0.7144620754955204, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 18495 + }, + { + "epoch": 0.18496, + "grad_norm": 0.7249052376451209, + "learning_rate": 0.003, + "loss": 4.018, + "step": 18496 + }, + { + "epoch": 0.18497, + "grad_norm": 0.7847380117299806, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 18497 + }, + { + "epoch": 0.18498, + "grad_norm": 0.9196205811049072, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 18498 + }, + { + "epoch": 0.18499, + "grad_norm": 0.9978465713301852, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 18499 + }, + { + "epoch": 0.185, + "grad_norm": 1.2254032374543695, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 18500 + }, + { + "epoch": 0.18501, + "grad_norm": 0.8662246686936604, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 18501 + }, + { + "epoch": 0.18502, + "grad_norm": 0.7970732484039922, + "learning_rate": 0.003, + "loss": 4.052, + "step": 18502 + }, + { + "epoch": 0.18503, + "grad_norm": 0.8844422038201806, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 18503 + }, + { + "epoch": 0.18504, + "grad_norm": 1.1654490981629433, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 18504 + }, + { + "epoch": 0.18505, + "grad_norm": 0.9050345806266741, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 18505 + }, + { + "epoch": 0.18506, + "grad_norm": 0.779394214768107, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 18506 + }, + { + "epoch": 0.18507, + "grad_norm": 0.862050962883863, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 18507 + }, + { + "epoch": 0.18508, + "grad_norm": 0.8815184232639309, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 18508 + }, + { + "epoch": 0.18509, + "grad_norm": 0.8056097550425132, + "learning_rate": 0.003, + "loss": 4.035, + "step": 18509 + }, + { + "epoch": 0.1851, + "grad_norm": 0.7791877804311869, + "learning_rate": 0.003, + "loss": 4.086, + "step": 18510 + }, + { + "epoch": 0.18511, + "grad_norm": 0.746159518453401, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 18511 + }, + { + "epoch": 0.18512, + "grad_norm": 0.7027773370904974, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 18512 + }, + { + "epoch": 0.18513, + "grad_norm": 0.7721353011637136, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 18513 + }, + { + "epoch": 0.18514, + "grad_norm": 0.8215571322591416, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 18514 + }, + { + "epoch": 0.18515, + "grad_norm": 0.9314042119284027, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 18515 + }, + { + "epoch": 0.18516, + "grad_norm": 1.0269123781707103, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 18516 + }, + { + "epoch": 0.18517, + "grad_norm": 1.0932754875226336, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 18517 + }, + { + "epoch": 0.18518, + "grad_norm": 0.9061094588281557, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 18518 + }, + { + "epoch": 0.18519, + "grad_norm": 0.9994184618025153, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 18519 + }, + { + "epoch": 0.1852, + "grad_norm": 0.9762699073612215, + "learning_rate": 0.003, + "loss": 4.05, + "step": 18520 + }, + { + "epoch": 0.18521, + "grad_norm": 0.9826633730146579, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 18521 + }, + { + "epoch": 0.18522, + "grad_norm": 0.9493219997439435, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 18522 + }, + { + "epoch": 0.18523, + "grad_norm": 0.8338856238760505, + "learning_rate": 0.003, + "loss": 4.044, + "step": 18523 + }, + { + "epoch": 0.18524, + "grad_norm": 0.6604945655333001, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 18524 + }, + { + "epoch": 0.18525, + "grad_norm": 0.602588548112755, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 18525 + }, + { + "epoch": 0.18526, + "grad_norm": 0.6890798022179152, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 18526 + }, + { + "epoch": 0.18527, + "grad_norm": 0.7583416468363614, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 18527 + }, + { + "epoch": 0.18528, + "grad_norm": 0.8064278412947958, + "learning_rate": 0.003, + "loss": 4.077, + "step": 18528 + }, + { + "epoch": 0.18529, + "grad_norm": 0.9415906924771318, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 18529 + }, + { + "epoch": 0.1853, + "grad_norm": 1.0054048758862657, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 18530 + }, + { + "epoch": 0.18531, + "grad_norm": 1.0521052734046925, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 18531 + }, + { + "epoch": 0.18532, + "grad_norm": 0.9634328372197097, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 18532 + }, + { + "epoch": 0.18533, + "grad_norm": 0.9808569150687576, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 18533 + }, + { + "epoch": 0.18534, + "grad_norm": 1.048165713144891, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 18534 + }, + { + "epoch": 0.18535, + "grad_norm": 1.0573097465128605, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 18535 + }, + { + "epoch": 0.18536, + "grad_norm": 1.063150165834106, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 18536 + }, + { + "epoch": 0.18537, + "grad_norm": 0.9292842474533076, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 18537 + }, + { + "epoch": 0.18538, + "grad_norm": 0.8128674410057475, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 18538 + }, + { + "epoch": 0.18539, + "grad_norm": 0.8341469754200896, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 18539 + }, + { + "epoch": 0.1854, + "grad_norm": 0.8838137436985168, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 18540 + }, + { + "epoch": 0.18541, + "grad_norm": 0.8878788892629679, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 18541 + }, + { + "epoch": 0.18542, + "grad_norm": 0.922069685170163, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 18542 + }, + { + "epoch": 0.18543, + "grad_norm": 0.865949105504782, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 18543 + }, + { + "epoch": 0.18544, + "grad_norm": 0.8314339821578145, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 18544 + }, + { + "epoch": 0.18545, + "grad_norm": 0.920204070855273, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 18545 + }, + { + "epoch": 0.18546, + "grad_norm": 1.0092756442902626, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 18546 + }, + { + "epoch": 0.18547, + "grad_norm": 1.1780330697843064, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 18547 + }, + { + "epoch": 0.18548, + "grad_norm": 0.9525007098326996, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 18548 + }, + { + "epoch": 0.18549, + "grad_norm": 0.8363546002359297, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 18549 + }, + { + "epoch": 0.1855, + "grad_norm": 0.8195504772440295, + "learning_rate": 0.003, + "loss": 4.07, + "step": 18550 + }, + { + "epoch": 0.18551, + "grad_norm": 0.7577289023619256, + "learning_rate": 0.003, + "loss": 4.059, + "step": 18551 + }, + { + "epoch": 0.18552, + "grad_norm": 0.8650761776571809, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 18552 + }, + { + "epoch": 0.18553, + "grad_norm": 0.8030292341735045, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 18553 + }, + { + "epoch": 0.18554, + "grad_norm": 0.8347437093497928, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 18554 + }, + { + "epoch": 0.18555, + "grad_norm": 0.978299065362135, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 18555 + }, + { + "epoch": 0.18556, + "grad_norm": 1.1173164361270036, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 18556 + }, + { + "epoch": 0.18557, + "grad_norm": 0.8360979444883614, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 18557 + }, + { + "epoch": 0.18558, + "grad_norm": 0.7582031849995656, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 18558 + }, + { + "epoch": 0.18559, + "grad_norm": 0.7068028381600463, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 18559 + }, + { + "epoch": 0.1856, + "grad_norm": 0.7180100367141031, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 18560 + }, + { + "epoch": 0.18561, + "grad_norm": 0.6840309184352409, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 18561 + }, + { + "epoch": 0.18562, + "grad_norm": 0.7694258088987754, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 18562 + }, + { + "epoch": 0.18563, + "grad_norm": 0.958104883986215, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 18563 + }, + { + "epoch": 0.18564, + "grad_norm": 1.4157366045585693, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 18564 + }, + { + "epoch": 0.18565, + "grad_norm": 0.5747070966442605, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 18565 + }, + { + "epoch": 0.18566, + "grad_norm": 0.6963956094707563, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 18566 + }, + { + "epoch": 0.18567, + "grad_norm": 0.7569404487630428, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 18567 + }, + { + "epoch": 0.18568, + "grad_norm": 0.7142714312803677, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 18568 + }, + { + "epoch": 0.18569, + "grad_norm": 0.7932754940972369, + "learning_rate": 0.003, + "loss": 4.063, + "step": 18569 + }, + { + "epoch": 0.1857, + "grad_norm": 0.948177045362133, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 18570 + }, + { + "epoch": 0.18571, + "grad_norm": 1.1651598379649453, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 18571 + }, + { + "epoch": 0.18572, + "grad_norm": 0.8348491979308896, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 18572 + }, + { + "epoch": 0.18573, + "grad_norm": 0.7978817295357757, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 18573 + }, + { + "epoch": 0.18574, + "grad_norm": 0.861319557730631, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 18574 + }, + { + "epoch": 0.18575, + "grad_norm": 0.8777951145807659, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 18575 + }, + { + "epoch": 0.18576, + "grad_norm": 0.7557209765978126, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 18576 + }, + { + "epoch": 0.18577, + "grad_norm": 0.7176279207797613, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 18577 + }, + { + "epoch": 0.18578, + "grad_norm": 0.8450161085980437, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 18578 + }, + { + "epoch": 0.18579, + "grad_norm": 1.0433029608767581, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 18579 + }, + { + "epoch": 0.1858, + "grad_norm": 1.0937265536703222, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 18580 + }, + { + "epoch": 0.18581, + "grad_norm": 1.0318135985630523, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 18581 + }, + { + "epoch": 0.18582, + "grad_norm": 1.1871668144742313, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 18582 + }, + { + "epoch": 0.18583, + "grad_norm": 0.9503706656273395, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 18583 + }, + { + "epoch": 0.18584, + "grad_norm": 0.8657144530670727, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 18584 + }, + { + "epoch": 0.18585, + "grad_norm": 0.804801902831209, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 18585 + }, + { + "epoch": 0.18586, + "grad_norm": 0.8820837687810786, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 18586 + }, + { + "epoch": 0.18587, + "grad_norm": 1.0015522090335942, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 18587 + }, + { + "epoch": 0.18588, + "grad_norm": 1.0145893090698617, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 18588 + }, + { + "epoch": 0.18589, + "grad_norm": 0.7714093426824449, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 18589 + }, + { + "epoch": 0.1859, + "grad_norm": 0.7239848046602954, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 18590 + }, + { + "epoch": 0.18591, + "grad_norm": 0.6753840091173711, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 18591 + }, + { + "epoch": 0.18592, + "grad_norm": 0.7673868892363467, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 18592 + }, + { + "epoch": 0.18593, + "grad_norm": 0.9701562480530804, + "learning_rate": 0.003, + "loss": 4.087, + "step": 18593 + }, + { + "epoch": 0.18594, + "grad_norm": 1.3944210066498526, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 18594 + }, + { + "epoch": 0.18595, + "grad_norm": 0.8126422871848954, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 18595 + }, + { + "epoch": 0.18596, + "grad_norm": 0.8016438186141875, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 18596 + }, + { + "epoch": 0.18597, + "grad_norm": 0.8534178146910221, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 18597 + }, + { + "epoch": 0.18598, + "grad_norm": 0.8008697451174429, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 18598 + }, + { + "epoch": 0.18599, + "grad_norm": 0.748727464666017, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 18599 + }, + { + "epoch": 0.186, + "grad_norm": 0.7970963092703048, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 18600 + }, + { + "epoch": 0.18601, + "grad_norm": 0.8937831658798338, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 18601 + }, + { + "epoch": 0.18602, + "grad_norm": 1.0473329983382251, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 18602 + }, + { + "epoch": 0.18603, + "grad_norm": 1.064094771613099, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 18603 + }, + { + "epoch": 0.18604, + "grad_norm": 1.1863735272555227, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 18604 + }, + { + "epoch": 0.18605, + "grad_norm": 0.818349618627902, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 18605 + }, + { + "epoch": 0.18606, + "grad_norm": 0.9333503419193867, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 18606 + }, + { + "epoch": 0.18607, + "grad_norm": 1.2111414666867866, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 18607 + }, + { + "epoch": 0.18608, + "grad_norm": 0.8497029535733186, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 18608 + }, + { + "epoch": 0.18609, + "grad_norm": 0.7137641796449244, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 18609 + }, + { + "epoch": 0.1861, + "grad_norm": 0.7932912505625324, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 18610 + }, + { + "epoch": 0.18611, + "grad_norm": 0.8232957366999208, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 18611 + }, + { + "epoch": 0.18612, + "grad_norm": 0.8075830264997964, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 18612 + }, + { + "epoch": 0.18613, + "grad_norm": 0.8771825778106802, + "learning_rate": 0.003, + "loss": 4.098, + "step": 18613 + }, + { + "epoch": 0.18614, + "grad_norm": 0.9470182671477181, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 18614 + }, + { + "epoch": 0.18615, + "grad_norm": 0.9258175891779046, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 18615 + }, + { + "epoch": 0.18616, + "grad_norm": 0.9036624713666295, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 18616 + }, + { + "epoch": 0.18617, + "grad_norm": 0.998954548989223, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 18617 + }, + { + "epoch": 0.18618, + "grad_norm": 0.9928413055052442, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 18618 + }, + { + "epoch": 0.18619, + "grad_norm": 0.9373758273689293, + "learning_rate": 0.003, + "loss": 4.061, + "step": 18619 + }, + { + "epoch": 0.1862, + "grad_norm": 0.9505850268826704, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 18620 + }, + { + "epoch": 0.18621, + "grad_norm": 0.9184104039486911, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 18621 + }, + { + "epoch": 0.18622, + "grad_norm": 0.7834062272402487, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 18622 + }, + { + "epoch": 0.18623, + "grad_norm": 0.8457309927041944, + "learning_rate": 0.003, + "loss": 4.057, + "step": 18623 + }, + { + "epoch": 0.18624, + "grad_norm": 0.773240324462519, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 18624 + }, + { + "epoch": 0.18625, + "grad_norm": 0.7758577710034719, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 18625 + }, + { + "epoch": 0.18626, + "grad_norm": 0.7474112197130057, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 18626 + }, + { + "epoch": 0.18627, + "grad_norm": 0.8009277134952273, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 18627 + }, + { + "epoch": 0.18628, + "grad_norm": 0.864150769355301, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 18628 + }, + { + "epoch": 0.18629, + "grad_norm": 0.9900300707558808, + "learning_rate": 0.003, + "loss": 4.054, + "step": 18629 + }, + { + "epoch": 0.1863, + "grad_norm": 1.1528805792552672, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 18630 + }, + { + "epoch": 0.18631, + "grad_norm": 0.9621551929838353, + "learning_rate": 0.003, + "loss": 4.091, + "step": 18631 + }, + { + "epoch": 0.18632, + "grad_norm": 1.0592645670992038, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 18632 + }, + { + "epoch": 0.18633, + "grad_norm": 1.0638430249249775, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 18633 + }, + { + "epoch": 0.18634, + "grad_norm": 0.8956163122694202, + "learning_rate": 0.003, + "loss": 4.069, + "step": 18634 + }, + { + "epoch": 0.18635, + "grad_norm": 0.9208634065672368, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 18635 + }, + { + "epoch": 0.18636, + "grad_norm": 0.8374672234311633, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 18636 + }, + { + "epoch": 0.18637, + "grad_norm": 1.0151809888599386, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 18637 + }, + { + "epoch": 0.18638, + "grad_norm": 1.2321944528000137, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 18638 + }, + { + "epoch": 0.18639, + "grad_norm": 0.6209810581818979, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 18639 + }, + { + "epoch": 0.1864, + "grad_norm": 0.6714859577579821, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 18640 + }, + { + "epoch": 0.18641, + "grad_norm": 0.7585800682226972, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 18641 + }, + { + "epoch": 0.18642, + "grad_norm": 0.7736416728996286, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 18642 + }, + { + "epoch": 0.18643, + "grad_norm": 0.6845587611141088, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 18643 + }, + { + "epoch": 0.18644, + "grad_norm": 0.6449457042942692, + "learning_rate": 0.003, + "loss": 4.04, + "step": 18644 + }, + { + "epoch": 0.18645, + "grad_norm": 0.6781658589648845, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 18645 + }, + { + "epoch": 0.18646, + "grad_norm": 0.7335346540476505, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 18646 + }, + { + "epoch": 0.18647, + "grad_norm": 0.9043414802065455, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 18647 + }, + { + "epoch": 0.18648, + "grad_norm": 1.115024472927529, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 18648 + }, + { + "epoch": 0.18649, + "grad_norm": 0.9442446230800574, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 18649 + }, + { + "epoch": 0.1865, + "grad_norm": 0.8640464791821411, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 18650 + }, + { + "epoch": 0.18651, + "grad_norm": 0.852620260969083, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 18651 + }, + { + "epoch": 0.18652, + "grad_norm": 0.8938138180833967, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 18652 + }, + { + "epoch": 0.18653, + "grad_norm": 0.9129665444894208, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 18653 + }, + { + "epoch": 0.18654, + "grad_norm": 0.9985517751187377, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 18654 + }, + { + "epoch": 0.18655, + "grad_norm": 1.114880208481357, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 18655 + }, + { + "epoch": 0.18656, + "grad_norm": 0.8585340599610687, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 18656 + }, + { + "epoch": 0.18657, + "grad_norm": 0.7283994937851266, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 18657 + }, + { + "epoch": 0.18658, + "grad_norm": 0.7626271075147254, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 18658 + }, + { + "epoch": 0.18659, + "grad_norm": 0.9814812835118157, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 18659 + }, + { + "epoch": 0.1866, + "grad_norm": 1.0757033001178842, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 18660 + }, + { + "epoch": 0.18661, + "grad_norm": 0.7884415209663592, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 18661 + }, + { + "epoch": 0.18662, + "grad_norm": 0.7979802653243565, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 18662 + }, + { + "epoch": 0.18663, + "grad_norm": 0.8907768455362602, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 18663 + }, + { + "epoch": 0.18664, + "grad_norm": 1.3466317763565474, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 18664 + }, + { + "epoch": 0.18665, + "grad_norm": 1.0636091644752153, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 18665 + }, + { + "epoch": 0.18666, + "grad_norm": 0.8427447620290612, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 18666 + }, + { + "epoch": 0.18667, + "grad_norm": 0.8992002508141136, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 18667 + }, + { + "epoch": 0.18668, + "grad_norm": 1.1605032609000148, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 18668 + }, + { + "epoch": 0.18669, + "grad_norm": 1.0041522367290034, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 18669 + }, + { + "epoch": 0.1867, + "grad_norm": 1.0570474655128794, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 18670 + }, + { + "epoch": 0.18671, + "grad_norm": 0.8562110161440691, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 18671 + }, + { + "epoch": 0.18672, + "grad_norm": 0.7916245608667062, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 18672 + }, + { + "epoch": 0.18673, + "grad_norm": 0.8174257597305178, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 18673 + }, + { + "epoch": 0.18674, + "grad_norm": 0.9890349710269822, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 18674 + }, + { + "epoch": 0.18675, + "grad_norm": 1.1489879870539976, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 18675 + }, + { + "epoch": 0.18676, + "grad_norm": 1.0579998447216608, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 18676 + }, + { + "epoch": 0.18677, + "grad_norm": 0.9422890963603554, + "learning_rate": 0.003, + "loss": 4.067, + "step": 18677 + }, + { + "epoch": 0.18678, + "grad_norm": 0.9266295073890203, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 18678 + }, + { + "epoch": 0.18679, + "grad_norm": 1.0229221223859826, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 18679 + }, + { + "epoch": 0.1868, + "grad_norm": 0.9790192128522515, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 18680 + }, + { + "epoch": 0.18681, + "grad_norm": 0.8227062142935673, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 18681 + }, + { + "epoch": 0.18682, + "grad_norm": 0.7613958240398305, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 18682 + }, + { + "epoch": 0.18683, + "grad_norm": 0.7971309870586542, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 18683 + }, + { + "epoch": 0.18684, + "grad_norm": 0.8694020388828363, + "learning_rate": 0.003, + "loss": 4.08, + "step": 18684 + }, + { + "epoch": 0.18685, + "grad_norm": 0.7547661441038792, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 18685 + }, + { + "epoch": 0.18686, + "grad_norm": 0.7586909918580507, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 18686 + }, + { + "epoch": 0.18687, + "grad_norm": 0.9244202442626519, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 18687 + }, + { + "epoch": 0.18688, + "grad_norm": 0.9714856248368594, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 18688 + }, + { + "epoch": 0.18689, + "grad_norm": 1.0033020555264711, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 18689 + }, + { + "epoch": 0.1869, + "grad_norm": 1.2299609613464744, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 18690 + }, + { + "epoch": 0.18691, + "grad_norm": 0.8317275090324072, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 18691 + }, + { + "epoch": 0.18692, + "grad_norm": 0.6599085621781208, + "learning_rate": 0.003, + "loss": 4.039, + "step": 18692 + }, + { + "epoch": 0.18693, + "grad_norm": 0.6042969686271215, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 18693 + }, + { + "epoch": 0.18694, + "grad_norm": 0.6085383113880665, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 18694 + }, + { + "epoch": 0.18695, + "grad_norm": 0.5585852976317263, + "learning_rate": 0.003, + "loss": 4.037, + "step": 18695 + }, + { + "epoch": 0.18696, + "grad_norm": 0.5403496937503798, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 18696 + }, + { + "epoch": 0.18697, + "grad_norm": 0.5333917672859838, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 18697 + }, + { + "epoch": 0.18698, + "grad_norm": 0.636439261340254, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 18698 + }, + { + "epoch": 0.18699, + "grad_norm": 0.8436122663873049, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 18699 + }, + { + "epoch": 0.187, + "grad_norm": 1.1818583809108116, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 18700 + }, + { + "epoch": 0.18701, + "grad_norm": 1.0260629147667248, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 18701 + }, + { + "epoch": 0.18702, + "grad_norm": 0.8223954752661594, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 18702 + }, + { + "epoch": 0.18703, + "grad_norm": 0.8137398214259036, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 18703 + }, + { + "epoch": 0.18704, + "grad_norm": 0.8143537813191083, + "learning_rate": 0.003, + "loss": 4.045, + "step": 18704 + }, + { + "epoch": 0.18705, + "grad_norm": 0.8892196928174321, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 18705 + }, + { + "epoch": 0.18706, + "grad_norm": 1.028766516821005, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 18706 + }, + { + "epoch": 0.18707, + "grad_norm": 0.9938237658131278, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 18707 + }, + { + "epoch": 0.18708, + "grad_norm": 1.1059708735089646, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 18708 + }, + { + "epoch": 0.18709, + "grad_norm": 1.0039294957107938, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 18709 + }, + { + "epoch": 0.1871, + "grad_norm": 0.9944472547123511, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 18710 + }, + { + "epoch": 0.18711, + "grad_norm": 0.9493386160201528, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 18711 + }, + { + "epoch": 0.18712, + "grad_norm": 0.8878470824962599, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 18712 + }, + { + "epoch": 0.18713, + "grad_norm": 0.8886403319100207, + "learning_rate": 0.003, + "loss": 4.075, + "step": 18713 + }, + { + "epoch": 0.18714, + "grad_norm": 1.0385483170221521, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 18714 + }, + { + "epoch": 0.18715, + "grad_norm": 1.079455300542097, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 18715 + }, + { + "epoch": 0.18716, + "grad_norm": 0.9226942336676118, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 18716 + }, + { + "epoch": 0.18717, + "grad_norm": 0.9031168481896467, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 18717 + }, + { + "epoch": 0.18718, + "grad_norm": 0.9145644943509552, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 18718 + }, + { + "epoch": 0.18719, + "grad_norm": 0.8901906264837669, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 18719 + }, + { + "epoch": 0.1872, + "grad_norm": 0.8823783083940833, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 18720 + }, + { + "epoch": 0.18721, + "grad_norm": 0.8741006416996544, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 18721 + }, + { + "epoch": 0.18722, + "grad_norm": 0.8771843269620443, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 18722 + }, + { + "epoch": 0.18723, + "grad_norm": 1.0859961396872917, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 18723 + }, + { + "epoch": 0.18724, + "grad_norm": 1.234783587381509, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 18724 + }, + { + "epoch": 0.18725, + "grad_norm": 0.8656584054193945, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 18725 + }, + { + "epoch": 0.18726, + "grad_norm": 1.0176295817640155, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 18726 + }, + { + "epoch": 0.18727, + "grad_norm": 1.0952543702855067, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 18727 + }, + { + "epoch": 0.18728, + "grad_norm": 0.9818200441142511, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 18728 + }, + { + "epoch": 0.18729, + "grad_norm": 0.8647280212517727, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 18729 + }, + { + "epoch": 0.1873, + "grad_norm": 0.7604727821073138, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 18730 + }, + { + "epoch": 0.18731, + "grad_norm": 0.8927783198152612, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 18731 + }, + { + "epoch": 0.18732, + "grad_norm": 1.1928710761756436, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 18732 + }, + { + "epoch": 0.18733, + "grad_norm": 1.0339184880306165, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 18733 + }, + { + "epoch": 0.18734, + "grad_norm": 0.9954957072543477, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 18734 + }, + { + "epoch": 0.18735, + "grad_norm": 1.0415461169977747, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 18735 + }, + { + "epoch": 0.18736, + "grad_norm": 1.0809808982084235, + "learning_rate": 0.003, + "loss": 4.085, + "step": 18736 + }, + { + "epoch": 0.18737, + "grad_norm": 0.9550940296303166, + "learning_rate": 0.003, + "loss": 4.095, + "step": 18737 + }, + { + "epoch": 0.18738, + "grad_norm": 0.8853494548839914, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 18738 + }, + { + "epoch": 0.18739, + "grad_norm": 0.8868575040078728, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 18739 + }, + { + "epoch": 0.1874, + "grad_norm": 0.8687924155211184, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 18740 + }, + { + "epoch": 0.18741, + "grad_norm": 1.0061975967728851, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 18741 + }, + { + "epoch": 0.18742, + "grad_norm": 1.0010206793135974, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 18742 + }, + { + "epoch": 0.18743, + "grad_norm": 0.9878387860859184, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 18743 + }, + { + "epoch": 0.18744, + "grad_norm": 0.9827023048517439, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 18744 + }, + { + "epoch": 0.18745, + "grad_norm": 0.932708005871291, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 18745 + }, + { + "epoch": 0.18746, + "grad_norm": 0.8068614961020573, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 18746 + }, + { + "epoch": 0.18747, + "grad_norm": 0.7385578015805192, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 18747 + }, + { + "epoch": 0.18748, + "grad_norm": 0.5711167269593707, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 18748 + }, + { + "epoch": 0.18749, + "grad_norm": 0.5294477523722143, + "learning_rate": 0.003, + "loss": 4.056, + "step": 18749 + }, + { + "epoch": 0.1875, + "grad_norm": 0.5140437975886389, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 18750 + }, + { + "epoch": 0.18751, + "grad_norm": 0.530468961475268, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 18751 + }, + { + "epoch": 0.18752, + "grad_norm": 0.5645708108749334, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 18752 + }, + { + "epoch": 0.18753, + "grad_norm": 0.675429752564956, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 18753 + }, + { + "epoch": 0.18754, + "grad_norm": 0.6639880753127896, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 18754 + }, + { + "epoch": 0.18755, + "grad_norm": 0.5432681051998008, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 18755 + }, + { + "epoch": 0.18756, + "grad_norm": 0.5940936271651334, + "learning_rate": 0.003, + "loss": 4.06, + "step": 18756 + }, + { + "epoch": 0.18757, + "grad_norm": 0.7237507616753347, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 18757 + }, + { + "epoch": 0.18758, + "grad_norm": 0.7555555056002169, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 18758 + }, + { + "epoch": 0.18759, + "grad_norm": 0.891224620725597, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 18759 + }, + { + "epoch": 0.1876, + "grad_norm": 1.3203251452776854, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 18760 + }, + { + "epoch": 0.18761, + "grad_norm": 1.0028429772376184, + "learning_rate": 0.003, + "loss": 4.053, + "step": 18761 + }, + { + "epoch": 0.18762, + "grad_norm": 1.0536848032883817, + "learning_rate": 0.003, + "loss": 4.083, + "step": 18762 + }, + { + "epoch": 0.18763, + "grad_norm": 0.8930248954359878, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 18763 + }, + { + "epoch": 0.18764, + "grad_norm": 0.9711429735798429, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 18764 + }, + { + "epoch": 0.18765, + "grad_norm": 0.9549252602224277, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 18765 + }, + { + "epoch": 0.18766, + "grad_norm": 0.8825744979988466, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 18766 + }, + { + "epoch": 0.18767, + "grad_norm": 0.866613538140772, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 18767 + }, + { + "epoch": 0.18768, + "grad_norm": 0.9927590475619487, + "learning_rate": 0.003, + "loss": 4.061, + "step": 18768 + }, + { + "epoch": 0.18769, + "grad_norm": 1.0088609322533033, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 18769 + }, + { + "epoch": 0.1877, + "grad_norm": 1.106423781881455, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 18770 + }, + { + "epoch": 0.18771, + "grad_norm": 1.034431665338578, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 18771 + }, + { + "epoch": 0.18772, + "grad_norm": 1.0627262986881796, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 18772 + }, + { + "epoch": 0.18773, + "grad_norm": 0.887195522054885, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 18773 + }, + { + "epoch": 0.18774, + "grad_norm": 0.858918141371021, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 18774 + }, + { + "epoch": 0.18775, + "grad_norm": 0.8239609561464694, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 18775 + }, + { + "epoch": 0.18776, + "grad_norm": 0.765249461107146, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 18776 + }, + { + "epoch": 0.18777, + "grad_norm": 0.7284087557981171, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 18777 + }, + { + "epoch": 0.18778, + "grad_norm": 0.7478154263010988, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 18778 + }, + { + "epoch": 0.18779, + "grad_norm": 0.8587254952613894, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 18779 + }, + { + "epoch": 0.1878, + "grad_norm": 1.060230551436364, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 18780 + }, + { + "epoch": 0.18781, + "grad_norm": 0.9978506452509432, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 18781 + }, + { + "epoch": 0.18782, + "grad_norm": 0.9835383501396353, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 18782 + }, + { + "epoch": 0.18783, + "grad_norm": 1.1044040436391112, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 18783 + }, + { + "epoch": 0.18784, + "grad_norm": 0.9758066096693506, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 18784 + }, + { + "epoch": 0.18785, + "grad_norm": 0.9497832993559777, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 18785 + }, + { + "epoch": 0.18786, + "grad_norm": 0.9153335118379946, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 18786 + }, + { + "epoch": 0.18787, + "grad_norm": 0.9234378938531879, + "learning_rate": 0.003, + "loss": 4.059, + "step": 18787 + }, + { + "epoch": 0.18788, + "grad_norm": 0.9706677008830245, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 18788 + }, + { + "epoch": 0.18789, + "grad_norm": 1.0192850757926157, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 18789 + }, + { + "epoch": 0.1879, + "grad_norm": 0.9550680647462348, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 18790 + }, + { + "epoch": 0.18791, + "grad_norm": 0.9308141125746637, + "learning_rate": 0.003, + "loss": 4.041, + "step": 18791 + }, + { + "epoch": 0.18792, + "grad_norm": 0.9191175824382493, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 18792 + }, + { + "epoch": 0.18793, + "grad_norm": 0.9020403126675917, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 18793 + }, + { + "epoch": 0.18794, + "grad_norm": 0.7026790245672965, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 18794 + }, + { + "epoch": 0.18795, + "grad_norm": 0.7463574707950998, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 18795 + }, + { + "epoch": 0.18796, + "grad_norm": 0.775705222498586, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 18796 + }, + { + "epoch": 0.18797, + "grad_norm": 0.860465009667747, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 18797 + }, + { + "epoch": 0.18798, + "grad_norm": 0.8729114215281945, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 18798 + }, + { + "epoch": 0.18799, + "grad_norm": 0.8493526673144045, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 18799 + }, + { + "epoch": 0.188, + "grad_norm": 0.8414683201108558, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 18800 + }, + { + "epoch": 0.18801, + "grad_norm": 0.7109917521508585, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 18801 + }, + { + "epoch": 0.18802, + "grad_norm": 0.7464302518820626, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 18802 + }, + { + "epoch": 0.18803, + "grad_norm": 0.9287888400318897, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 18803 + }, + { + "epoch": 0.18804, + "grad_norm": 0.9558074591311403, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 18804 + }, + { + "epoch": 0.18805, + "grad_norm": 1.131489662866921, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 18805 + }, + { + "epoch": 0.18806, + "grad_norm": 1.0499753946196504, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 18806 + }, + { + "epoch": 0.18807, + "grad_norm": 0.9022586354003591, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 18807 + }, + { + "epoch": 0.18808, + "grad_norm": 0.9819719640203605, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 18808 + }, + { + "epoch": 0.18809, + "grad_norm": 0.8317694849570995, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 18809 + }, + { + "epoch": 0.1881, + "grad_norm": 0.8069785912305021, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 18810 + }, + { + "epoch": 0.18811, + "grad_norm": 0.9416952608965097, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 18811 + }, + { + "epoch": 0.18812, + "grad_norm": 1.0691887778931077, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 18812 + }, + { + "epoch": 0.18813, + "grad_norm": 0.9226958543712647, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 18813 + }, + { + "epoch": 0.18814, + "grad_norm": 0.9931817208848512, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 18814 + }, + { + "epoch": 0.18815, + "grad_norm": 1.2125783717525978, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 18815 + }, + { + "epoch": 0.18816, + "grad_norm": 0.8120664996097132, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 18816 + }, + { + "epoch": 0.18817, + "grad_norm": 0.6353000270082406, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 18817 + }, + { + "epoch": 0.18818, + "grad_norm": 0.6002060133252773, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 18818 + }, + { + "epoch": 0.18819, + "grad_norm": 0.6266331901580718, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 18819 + }, + { + "epoch": 0.1882, + "grad_norm": 0.5770467050684869, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 18820 + }, + { + "epoch": 0.18821, + "grad_norm": 0.5613509305690385, + "learning_rate": 0.003, + "loss": 4.072, + "step": 18821 + }, + { + "epoch": 0.18822, + "grad_norm": 0.5150651484254392, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 18822 + }, + { + "epoch": 0.18823, + "grad_norm": 0.5568903900998636, + "learning_rate": 0.003, + "loss": 4.047, + "step": 18823 + }, + { + "epoch": 0.18824, + "grad_norm": 0.7060667885894555, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 18824 + }, + { + "epoch": 0.18825, + "grad_norm": 0.9063872184032593, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 18825 + }, + { + "epoch": 0.18826, + "grad_norm": 1.2178790678775449, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 18826 + }, + { + "epoch": 0.18827, + "grad_norm": 0.7594558247508345, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 18827 + }, + { + "epoch": 0.18828, + "grad_norm": 0.6593887365400035, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 18828 + }, + { + "epoch": 0.18829, + "grad_norm": 0.6966940622137495, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 18829 + }, + { + "epoch": 0.1883, + "grad_norm": 0.6614036416998257, + "learning_rate": 0.003, + "loss": 4.054, + "step": 18830 + }, + { + "epoch": 0.18831, + "grad_norm": 0.715301254672398, + "learning_rate": 0.003, + "loss": 4.028, + "step": 18831 + }, + { + "epoch": 0.18832, + "grad_norm": 0.781847676703661, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 18832 + }, + { + "epoch": 0.18833, + "grad_norm": 0.7080776058645224, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 18833 + }, + { + "epoch": 0.18834, + "grad_norm": 0.7356251735610517, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 18834 + }, + { + "epoch": 0.18835, + "grad_norm": 0.7430063733504297, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 18835 + }, + { + "epoch": 0.18836, + "grad_norm": 0.8891537651152955, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 18836 + }, + { + "epoch": 0.18837, + "grad_norm": 1.0556734859556318, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 18837 + }, + { + "epoch": 0.18838, + "grad_norm": 1.1679233135151377, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 18838 + }, + { + "epoch": 0.18839, + "grad_norm": 0.8264179660628502, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 18839 + }, + { + "epoch": 0.1884, + "grad_norm": 0.7098483265548313, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 18840 + }, + { + "epoch": 0.18841, + "grad_norm": 0.8054116842630247, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 18841 + }, + { + "epoch": 0.18842, + "grad_norm": 0.8509981670241051, + "learning_rate": 0.003, + "loss": 4.063, + "step": 18842 + }, + { + "epoch": 0.18843, + "grad_norm": 0.9969852262785717, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 18843 + }, + { + "epoch": 0.18844, + "grad_norm": 1.1413452406387146, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 18844 + }, + { + "epoch": 0.18845, + "grad_norm": 0.872671479987583, + "learning_rate": 0.003, + "loss": 4.056, + "step": 18845 + }, + { + "epoch": 0.18846, + "grad_norm": 0.9264950201075592, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 18846 + }, + { + "epoch": 0.18847, + "grad_norm": 1.4893856769578935, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 18847 + }, + { + "epoch": 0.18848, + "grad_norm": 0.8722870420523543, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 18848 + }, + { + "epoch": 0.18849, + "grad_norm": 0.8718515176998175, + "learning_rate": 0.003, + "loss": 4.071, + "step": 18849 + }, + { + "epoch": 0.1885, + "grad_norm": 1.239750137077456, + "learning_rate": 0.003, + "loss": 4.078, + "step": 18850 + }, + { + "epoch": 0.18851, + "grad_norm": 1.085094743759517, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 18851 + }, + { + "epoch": 0.18852, + "grad_norm": 0.9221287522489386, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 18852 + }, + { + "epoch": 0.18853, + "grad_norm": 0.9790516636979539, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 18853 + }, + { + "epoch": 0.18854, + "grad_norm": 1.0851939061736167, + "learning_rate": 0.003, + "loss": 4.093, + "step": 18854 + }, + { + "epoch": 0.18855, + "grad_norm": 0.8143372006760771, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 18855 + }, + { + "epoch": 0.18856, + "grad_norm": 0.8526868039812159, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 18856 + }, + { + "epoch": 0.18857, + "grad_norm": 0.934240988277304, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 18857 + }, + { + "epoch": 0.18858, + "grad_norm": 1.059774404285563, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 18858 + }, + { + "epoch": 0.18859, + "grad_norm": 1.3744112044009085, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 18859 + }, + { + "epoch": 0.1886, + "grad_norm": 0.838191945956162, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 18860 + }, + { + "epoch": 0.18861, + "grad_norm": 0.9567723189309212, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 18861 + }, + { + "epoch": 0.18862, + "grad_norm": 0.9522209163282251, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 18862 + }, + { + "epoch": 0.18863, + "grad_norm": 0.9701645634719633, + "learning_rate": 0.003, + "loss": 4.048, + "step": 18863 + }, + { + "epoch": 0.18864, + "grad_norm": 0.8867468108089815, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 18864 + }, + { + "epoch": 0.18865, + "grad_norm": 0.9147378691920581, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 18865 + }, + { + "epoch": 0.18866, + "grad_norm": 0.9408687642786978, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 18866 + }, + { + "epoch": 0.18867, + "grad_norm": 0.8335230506646315, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 18867 + }, + { + "epoch": 0.18868, + "grad_norm": 0.8060490775288223, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 18868 + }, + { + "epoch": 0.18869, + "grad_norm": 0.970622923617127, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 18869 + }, + { + "epoch": 0.1887, + "grad_norm": 1.1854363082597448, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 18870 + }, + { + "epoch": 0.18871, + "grad_norm": 0.8204597507089216, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 18871 + }, + { + "epoch": 0.18872, + "grad_norm": 0.7326807644564894, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 18872 + }, + { + "epoch": 0.18873, + "grad_norm": 0.7012234089239755, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 18873 + }, + { + "epoch": 0.18874, + "grad_norm": 0.6900292394829466, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 18874 + }, + { + "epoch": 0.18875, + "grad_norm": 0.6939154791718721, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 18875 + }, + { + "epoch": 0.18876, + "grad_norm": 0.7517473133778547, + "learning_rate": 0.003, + "loss": 4.056, + "step": 18876 + }, + { + "epoch": 0.18877, + "grad_norm": 0.8191317178861717, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 18877 + }, + { + "epoch": 0.18878, + "grad_norm": 0.8538127620028799, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 18878 + }, + { + "epoch": 0.18879, + "grad_norm": 0.932832724439279, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 18879 + }, + { + "epoch": 0.1888, + "grad_norm": 1.092720141013755, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 18880 + }, + { + "epoch": 0.18881, + "grad_norm": 1.242970126766121, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 18881 + }, + { + "epoch": 0.18882, + "grad_norm": 0.6911935609097055, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 18882 + }, + { + "epoch": 0.18883, + "grad_norm": 0.5855532035751623, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 18883 + }, + { + "epoch": 0.18884, + "grad_norm": 0.5788839021366634, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 18884 + }, + { + "epoch": 0.18885, + "grad_norm": 0.6723976432780131, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 18885 + }, + { + "epoch": 0.18886, + "grad_norm": 0.9613490012287368, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 18886 + }, + { + "epoch": 0.18887, + "grad_norm": 1.2156570563802014, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 18887 + }, + { + "epoch": 0.18888, + "grad_norm": 0.7026564730680053, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 18888 + }, + { + "epoch": 0.18889, + "grad_norm": 0.637218552771614, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 18889 + }, + { + "epoch": 0.1889, + "grad_norm": 0.7848841726408939, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 18890 + }, + { + "epoch": 0.18891, + "grad_norm": 0.8174618056874458, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 18891 + }, + { + "epoch": 0.18892, + "grad_norm": 0.82324037796606, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 18892 + }, + { + "epoch": 0.18893, + "grad_norm": 0.8989579567688616, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 18893 + }, + { + "epoch": 0.18894, + "grad_norm": 0.8170069921172262, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 18894 + }, + { + "epoch": 0.18895, + "grad_norm": 0.8709899818471913, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 18895 + }, + { + "epoch": 0.18896, + "grad_norm": 0.9816936074435, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 18896 + }, + { + "epoch": 0.18897, + "grad_norm": 0.9385300050005491, + "learning_rate": 0.003, + "loss": 4.043, + "step": 18897 + }, + { + "epoch": 0.18898, + "grad_norm": 1.0164258971081395, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 18898 + }, + { + "epoch": 0.18899, + "grad_norm": 1.2353664741512713, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 18899 + }, + { + "epoch": 0.189, + "grad_norm": 0.8115214863222378, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 18900 + }, + { + "epoch": 0.18901, + "grad_norm": 0.8702930207318287, + "learning_rate": 0.003, + "loss": 4.06, + "step": 18901 + }, + { + "epoch": 0.18902, + "grad_norm": 0.9074106996674826, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 18902 + }, + { + "epoch": 0.18903, + "grad_norm": 0.8769430132728124, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 18903 + }, + { + "epoch": 0.18904, + "grad_norm": 0.8357201714124332, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 18904 + }, + { + "epoch": 0.18905, + "grad_norm": 0.8568778260748834, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 18905 + }, + { + "epoch": 0.18906, + "grad_norm": 0.8773830592359707, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 18906 + }, + { + "epoch": 0.18907, + "grad_norm": 1.1018615697908245, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 18907 + }, + { + "epoch": 0.18908, + "grad_norm": 1.0768350176206178, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 18908 + }, + { + "epoch": 0.18909, + "grad_norm": 1.02958642003902, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 18909 + }, + { + "epoch": 0.1891, + "grad_norm": 1.109531743082689, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 18910 + }, + { + "epoch": 0.18911, + "grad_norm": 1.0706380059324208, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 18911 + }, + { + "epoch": 0.18912, + "grad_norm": 0.9401072080081331, + "learning_rate": 0.003, + "loss": 4.081, + "step": 18912 + }, + { + "epoch": 0.18913, + "grad_norm": 0.9471800113328755, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 18913 + }, + { + "epoch": 0.18914, + "grad_norm": 1.03021998339916, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 18914 + }, + { + "epoch": 0.18915, + "grad_norm": 0.9926485053115832, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 18915 + }, + { + "epoch": 0.18916, + "grad_norm": 0.8649525785747464, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 18916 + }, + { + "epoch": 0.18917, + "grad_norm": 0.8716473557338101, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 18917 + }, + { + "epoch": 0.18918, + "grad_norm": 0.7974051544805679, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 18918 + }, + { + "epoch": 0.18919, + "grad_norm": 0.7486120150741057, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 18919 + }, + { + "epoch": 0.1892, + "grad_norm": 0.7471681148128011, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 18920 + }, + { + "epoch": 0.18921, + "grad_norm": 0.8481416100666542, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 18921 + }, + { + "epoch": 0.18922, + "grad_norm": 1.05609494877661, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 18922 + }, + { + "epoch": 0.18923, + "grad_norm": 1.0960752489006915, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 18923 + }, + { + "epoch": 0.18924, + "grad_norm": 0.7967767649772057, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 18924 + }, + { + "epoch": 0.18925, + "grad_norm": 0.7933314767515339, + "learning_rate": 0.003, + "loss": 4.047, + "step": 18925 + }, + { + "epoch": 0.18926, + "grad_norm": 0.8148930991589804, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 18926 + }, + { + "epoch": 0.18927, + "grad_norm": 1.0041858509230297, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 18927 + }, + { + "epoch": 0.18928, + "grad_norm": 1.250085685550684, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 18928 + }, + { + "epoch": 0.18929, + "grad_norm": 0.9377175281824044, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 18929 + }, + { + "epoch": 0.1893, + "grad_norm": 0.9908659703128931, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 18930 + }, + { + "epoch": 0.18931, + "grad_norm": 1.0835707397021723, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 18931 + }, + { + "epoch": 0.18932, + "grad_norm": 0.8962878343047594, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 18932 + }, + { + "epoch": 0.18933, + "grad_norm": 0.8173775100193572, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 18933 + }, + { + "epoch": 0.18934, + "grad_norm": 0.8729611047696794, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 18934 + }, + { + "epoch": 0.18935, + "grad_norm": 0.8113040778480568, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 18935 + }, + { + "epoch": 0.18936, + "grad_norm": 0.7988674454618783, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 18936 + }, + { + "epoch": 0.18937, + "grad_norm": 0.8254437408286549, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 18937 + }, + { + "epoch": 0.18938, + "grad_norm": 0.9520101741877295, + "learning_rate": 0.003, + "loss": 4.073, + "step": 18938 + }, + { + "epoch": 0.18939, + "grad_norm": 0.8857240409846306, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 18939 + }, + { + "epoch": 0.1894, + "grad_norm": 0.88749937816289, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 18940 + }, + { + "epoch": 0.18941, + "grad_norm": 1.1227583098496423, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 18941 + }, + { + "epoch": 0.18942, + "grad_norm": 0.9832114707116061, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 18942 + }, + { + "epoch": 0.18943, + "grad_norm": 0.9600468349870453, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 18943 + }, + { + "epoch": 0.18944, + "grad_norm": 0.8732775554293952, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 18944 + }, + { + "epoch": 0.18945, + "grad_norm": 0.7220890422982016, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 18945 + }, + { + "epoch": 0.18946, + "grad_norm": 0.6903304654162348, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 18946 + }, + { + "epoch": 0.18947, + "grad_norm": 0.6762756345509435, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 18947 + }, + { + "epoch": 0.18948, + "grad_norm": 0.6460604076303211, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 18948 + }, + { + "epoch": 0.18949, + "grad_norm": 0.6329574927541973, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 18949 + }, + { + "epoch": 0.1895, + "grad_norm": 0.6836089872978555, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 18950 + }, + { + "epoch": 0.18951, + "grad_norm": 0.7086644898134858, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 18951 + }, + { + "epoch": 0.18952, + "grad_norm": 0.7100978117855653, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 18952 + }, + { + "epoch": 0.18953, + "grad_norm": 0.7871203746614859, + "learning_rate": 0.003, + "loss": 4.052, + "step": 18953 + }, + { + "epoch": 0.18954, + "grad_norm": 0.8469184527638219, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 18954 + }, + { + "epoch": 0.18955, + "grad_norm": 1.0414248711497491, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 18955 + }, + { + "epoch": 0.18956, + "grad_norm": 1.3506103140542016, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 18956 + }, + { + "epoch": 0.18957, + "grad_norm": 0.6334204222274756, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 18957 + }, + { + "epoch": 0.18958, + "grad_norm": 0.6643756571034968, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 18958 + }, + { + "epoch": 0.18959, + "grad_norm": 0.7045097358355302, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 18959 + }, + { + "epoch": 0.1896, + "grad_norm": 0.7453331975404801, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 18960 + }, + { + "epoch": 0.18961, + "grad_norm": 0.8504204787573448, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 18961 + }, + { + "epoch": 0.18962, + "grad_norm": 1.013284928088702, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 18962 + }, + { + "epoch": 0.18963, + "grad_norm": 1.3091718330381101, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 18963 + }, + { + "epoch": 0.18964, + "grad_norm": 0.6574366733188126, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 18964 + }, + { + "epoch": 0.18965, + "grad_norm": 0.6169896902961652, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 18965 + }, + { + "epoch": 0.18966, + "grad_norm": 0.670031144439429, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 18966 + }, + { + "epoch": 0.18967, + "grad_norm": 0.6229251933542171, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 18967 + }, + { + "epoch": 0.18968, + "grad_norm": 0.6948496321264902, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 18968 + }, + { + "epoch": 0.18969, + "grad_norm": 0.8592221207104161, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 18969 + }, + { + "epoch": 0.1897, + "grad_norm": 0.9751045037276446, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 18970 + }, + { + "epoch": 0.18971, + "grad_norm": 1.086102405373793, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 18971 + }, + { + "epoch": 0.18972, + "grad_norm": 1.0940275559044008, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 18972 + }, + { + "epoch": 0.18973, + "grad_norm": 1.1265406620659937, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 18973 + }, + { + "epoch": 0.18974, + "grad_norm": 1.0123078102853043, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 18974 + }, + { + "epoch": 0.18975, + "grad_norm": 0.9758868356065785, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 18975 + }, + { + "epoch": 0.18976, + "grad_norm": 0.9080524279516271, + "learning_rate": 0.003, + "loss": 4.095, + "step": 18976 + }, + { + "epoch": 0.18977, + "grad_norm": 0.9830345863691641, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 18977 + }, + { + "epoch": 0.18978, + "grad_norm": 1.177799175192337, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 18978 + }, + { + "epoch": 0.18979, + "grad_norm": 1.0274277802455434, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 18979 + }, + { + "epoch": 0.1898, + "grad_norm": 1.02455739441151, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 18980 + }, + { + "epoch": 0.18981, + "grad_norm": 1.2845174388375342, + "learning_rate": 0.003, + "loss": 4.081, + "step": 18981 + }, + { + "epoch": 0.18982, + "grad_norm": 0.971770146010959, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 18982 + }, + { + "epoch": 0.18983, + "grad_norm": 0.9270480670885711, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 18983 + }, + { + "epoch": 0.18984, + "grad_norm": 0.9218668106555503, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 18984 + }, + { + "epoch": 0.18985, + "grad_norm": 0.868318247439268, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 18985 + }, + { + "epoch": 0.18986, + "grad_norm": 0.890714238448063, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 18986 + }, + { + "epoch": 0.18987, + "grad_norm": 0.9555878528790877, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 18987 + }, + { + "epoch": 0.18988, + "grad_norm": 1.0551855156304657, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 18988 + }, + { + "epoch": 0.18989, + "grad_norm": 1.12462282757074, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 18989 + }, + { + "epoch": 0.1899, + "grad_norm": 1.038697462277725, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 18990 + }, + { + "epoch": 0.18991, + "grad_norm": 1.1496875897116776, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 18991 + }, + { + "epoch": 0.18992, + "grad_norm": 1.1227898373814509, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 18992 + }, + { + "epoch": 0.18993, + "grad_norm": 0.7791332709741187, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 18993 + }, + { + "epoch": 0.18994, + "grad_norm": 0.5527281179829645, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 18994 + }, + { + "epoch": 0.18995, + "grad_norm": 0.604223751927832, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 18995 + }, + { + "epoch": 0.18996, + "grad_norm": 0.6747532465704158, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 18996 + }, + { + "epoch": 0.18997, + "grad_norm": 0.7207582894714029, + "learning_rate": 0.003, + "loss": 4.068, + "step": 18997 + }, + { + "epoch": 0.18998, + "grad_norm": 0.7449484313550031, + "learning_rate": 0.003, + "loss": 4.065, + "step": 18998 + }, + { + "epoch": 0.18999, + "grad_norm": 0.8533927002613969, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 18999 + }, + { + "epoch": 0.19, + "grad_norm": 0.9778404124788905, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 19000 + }, + { + "epoch": 0.19001, + "grad_norm": 1.1364391339030848, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 19001 + }, + { + "epoch": 0.19002, + "grad_norm": 0.6695573143213839, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 19002 + }, + { + "epoch": 0.19003, + "grad_norm": 0.6048424395466192, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 19003 + }, + { + "epoch": 0.19004, + "grad_norm": 0.6115529014494826, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 19004 + }, + { + "epoch": 0.19005, + "grad_norm": 0.5770869656635091, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 19005 + }, + { + "epoch": 0.19006, + "grad_norm": 0.5873184325931075, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 19006 + }, + { + "epoch": 0.19007, + "grad_norm": 0.6284217240533744, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 19007 + }, + { + "epoch": 0.19008, + "grad_norm": 0.6135687901151156, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 19008 + }, + { + "epoch": 0.19009, + "grad_norm": 0.6565765096883247, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 19009 + }, + { + "epoch": 0.1901, + "grad_norm": 0.7769475849315273, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 19010 + }, + { + "epoch": 0.19011, + "grad_norm": 0.8040936368011172, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 19011 + }, + { + "epoch": 0.19012, + "grad_norm": 0.8936636379105999, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 19012 + }, + { + "epoch": 0.19013, + "grad_norm": 0.9324952613877908, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 19013 + }, + { + "epoch": 0.19014, + "grad_norm": 1.13754471117426, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 19014 + }, + { + "epoch": 0.19015, + "grad_norm": 1.1953709908335939, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 19015 + }, + { + "epoch": 0.19016, + "grad_norm": 0.8246736475318042, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 19016 + }, + { + "epoch": 0.19017, + "grad_norm": 0.777573909964225, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 19017 + }, + { + "epoch": 0.19018, + "grad_norm": 0.9216623413003302, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 19018 + }, + { + "epoch": 0.19019, + "grad_norm": 1.0944538370069077, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 19019 + }, + { + "epoch": 0.1902, + "grad_norm": 0.9709440838124774, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 19020 + }, + { + "epoch": 0.19021, + "grad_norm": 0.9160250481035979, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 19021 + }, + { + "epoch": 0.19022, + "grad_norm": 0.9012521941774062, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 19022 + }, + { + "epoch": 0.19023, + "grad_norm": 0.8770374884453678, + "learning_rate": 0.003, + "loss": 4.106, + "step": 19023 + }, + { + "epoch": 0.19024, + "grad_norm": 0.9632582548635487, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 19024 + }, + { + "epoch": 0.19025, + "grad_norm": 0.9475115024002957, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 19025 + }, + { + "epoch": 0.19026, + "grad_norm": 1.0989703319915722, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 19026 + }, + { + "epoch": 0.19027, + "grad_norm": 1.1497766262185596, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 19027 + }, + { + "epoch": 0.19028, + "grad_norm": 0.9571602279212525, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 19028 + }, + { + "epoch": 0.19029, + "grad_norm": 1.0474889964290843, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 19029 + }, + { + "epoch": 0.1903, + "grad_norm": 1.0323725074430252, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 19030 + }, + { + "epoch": 0.19031, + "grad_norm": 0.8808059545512241, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 19031 + }, + { + "epoch": 0.19032, + "grad_norm": 0.7104999259230595, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 19032 + }, + { + "epoch": 0.19033, + "grad_norm": 0.7214429505155346, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 19033 + }, + { + "epoch": 0.19034, + "grad_norm": 0.8412580038537382, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 19034 + }, + { + "epoch": 0.19035, + "grad_norm": 1.0452842487812224, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 19035 + }, + { + "epoch": 0.19036, + "grad_norm": 1.079435052990799, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 19036 + }, + { + "epoch": 0.19037, + "grad_norm": 0.9397425655569261, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 19037 + }, + { + "epoch": 0.19038, + "grad_norm": 0.9761474114736021, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 19038 + }, + { + "epoch": 0.19039, + "grad_norm": 0.8420411401715001, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 19039 + }, + { + "epoch": 0.1904, + "grad_norm": 0.77521773509717, + "learning_rate": 0.003, + "loss": 4.036, + "step": 19040 + }, + { + "epoch": 0.19041, + "grad_norm": 0.8430973686243493, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 19041 + }, + { + "epoch": 0.19042, + "grad_norm": 1.0806468455160583, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 19042 + }, + { + "epoch": 0.19043, + "grad_norm": 1.123342672423004, + "learning_rate": 0.003, + "loss": 4.082, + "step": 19043 + }, + { + "epoch": 0.19044, + "grad_norm": 0.7778214511480973, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 19044 + }, + { + "epoch": 0.19045, + "grad_norm": 0.7600183814017205, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 19045 + }, + { + "epoch": 0.19046, + "grad_norm": 0.7241825916248691, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 19046 + }, + { + "epoch": 0.19047, + "grad_norm": 0.8160897029473771, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 19047 + }, + { + "epoch": 0.19048, + "grad_norm": 0.8226404474133778, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 19048 + }, + { + "epoch": 0.19049, + "grad_norm": 0.8002548929155533, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 19049 + }, + { + "epoch": 0.1905, + "grad_norm": 0.8369692255230874, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 19050 + }, + { + "epoch": 0.19051, + "grad_norm": 0.9504811808648432, + "learning_rate": 0.003, + "loss": 4.067, + "step": 19051 + }, + { + "epoch": 0.19052, + "grad_norm": 0.9110330922214464, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 19052 + }, + { + "epoch": 0.19053, + "grad_norm": 1.0333947672152355, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 19053 + }, + { + "epoch": 0.19054, + "grad_norm": 1.0502654522138795, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 19054 + }, + { + "epoch": 0.19055, + "grad_norm": 0.8806951048462609, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 19055 + }, + { + "epoch": 0.19056, + "grad_norm": 0.7160675565186634, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 19056 + }, + { + "epoch": 0.19057, + "grad_norm": 0.800471976042363, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 19057 + }, + { + "epoch": 0.19058, + "grad_norm": 0.9237572272122008, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 19058 + }, + { + "epoch": 0.19059, + "grad_norm": 1.0207160767994679, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 19059 + }, + { + "epoch": 0.1906, + "grad_norm": 1.0155380104867275, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 19060 + }, + { + "epoch": 0.19061, + "grad_norm": 0.9719302098268005, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 19061 + }, + { + "epoch": 0.19062, + "grad_norm": 0.8827863254829319, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 19062 + }, + { + "epoch": 0.19063, + "grad_norm": 0.735191162169673, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 19063 + }, + { + "epoch": 0.19064, + "grad_norm": 0.8052584913638133, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 19064 + }, + { + "epoch": 0.19065, + "grad_norm": 0.8423568825317832, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 19065 + }, + { + "epoch": 0.19066, + "grad_norm": 0.9768182473156256, + "learning_rate": 0.003, + "loss": 4.067, + "step": 19066 + }, + { + "epoch": 0.19067, + "grad_norm": 1.1381319150884115, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 19067 + }, + { + "epoch": 0.19068, + "grad_norm": 0.9586070130478144, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 19068 + }, + { + "epoch": 0.19069, + "grad_norm": 1.1867842705878802, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 19069 + }, + { + "epoch": 0.1907, + "grad_norm": 0.8853121649755085, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 19070 + }, + { + "epoch": 0.19071, + "grad_norm": 0.8364749876177177, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 19071 + }, + { + "epoch": 0.19072, + "grad_norm": 0.7779189562093047, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 19072 + }, + { + "epoch": 0.19073, + "grad_norm": 0.7766213145962902, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 19073 + }, + { + "epoch": 0.19074, + "grad_norm": 0.776139594455867, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 19074 + }, + { + "epoch": 0.19075, + "grad_norm": 0.8693476495515625, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 19075 + }, + { + "epoch": 0.19076, + "grad_norm": 0.9944336576999422, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 19076 + }, + { + "epoch": 0.19077, + "grad_norm": 1.0381320802359264, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 19077 + }, + { + "epoch": 0.19078, + "grad_norm": 0.9919237266469918, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 19078 + }, + { + "epoch": 0.19079, + "grad_norm": 1.0211300167597395, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 19079 + }, + { + "epoch": 0.1908, + "grad_norm": 0.8948322404700847, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 19080 + }, + { + "epoch": 0.19081, + "grad_norm": 0.801754753027131, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 19081 + }, + { + "epoch": 0.19082, + "grad_norm": 0.7455925499217908, + "learning_rate": 0.003, + "loss": 4.059, + "step": 19082 + }, + { + "epoch": 0.19083, + "grad_norm": 0.7911114663148194, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 19083 + }, + { + "epoch": 0.19084, + "grad_norm": 0.8781286305127428, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 19084 + }, + { + "epoch": 0.19085, + "grad_norm": 0.9910629670424296, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 19085 + }, + { + "epoch": 0.19086, + "grad_norm": 1.1617636532351778, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 19086 + }, + { + "epoch": 0.19087, + "grad_norm": 0.9859533117451245, + "learning_rate": 0.003, + "loss": 4.073, + "step": 19087 + }, + { + "epoch": 0.19088, + "grad_norm": 0.9392208421502648, + "learning_rate": 0.003, + "loss": 4.094, + "step": 19088 + }, + { + "epoch": 0.19089, + "grad_norm": 0.9997116388962914, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 19089 + }, + { + "epoch": 0.1909, + "grad_norm": 1.2245682599145473, + "learning_rate": 0.003, + "loss": 4.073, + "step": 19090 + }, + { + "epoch": 0.19091, + "grad_norm": 0.8624504485237479, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 19091 + }, + { + "epoch": 0.19092, + "grad_norm": 0.7901900475596503, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 19092 + }, + { + "epoch": 0.19093, + "grad_norm": 0.7944658431544632, + "learning_rate": 0.003, + "loss": 4.059, + "step": 19093 + }, + { + "epoch": 0.19094, + "grad_norm": 0.612304868461562, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 19094 + }, + { + "epoch": 0.19095, + "grad_norm": 0.6407830815765895, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 19095 + }, + { + "epoch": 0.19096, + "grad_norm": 0.5426360117642127, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 19096 + }, + { + "epoch": 0.19097, + "grad_norm": 0.5338474522388726, + "learning_rate": 0.003, + "loss": 4.065, + "step": 19097 + }, + { + "epoch": 0.19098, + "grad_norm": 0.5101727978090598, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 19098 + }, + { + "epoch": 0.19099, + "grad_norm": 0.5955354350848842, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 19099 + }, + { + "epoch": 0.191, + "grad_norm": 0.8188553946198028, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 19100 + }, + { + "epoch": 0.19101, + "grad_norm": 1.0956303663338571, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 19101 + }, + { + "epoch": 0.19102, + "grad_norm": 1.2288352687088009, + "learning_rate": 0.003, + "loss": 4.046, + "step": 19102 + }, + { + "epoch": 0.19103, + "grad_norm": 0.6996486707725332, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 19103 + }, + { + "epoch": 0.19104, + "grad_norm": 0.6681169360270387, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 19104 + }, + { + "epoch": 0.19105, + "grad_norm": 0.6994247802933407, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 19105 + }, + { + "epoch": 0.19106, + "grad_norm": 0.7142269955146352, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 19106 + }, + { + "epoch": 0.19107, + "grad_norm": 0.6740016205411797, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 19107 + }, + { + "epoch": 0.19108, + "grad_norm": 0.7858392813953062, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 19108 + }, + { + "epoch": 0.19109, + "grad_norm": 0.9914323241276799, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 19109 + }, + { + "epoch": 0.1911, + "grad_norm": 1.201935745501774, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 19110 + }, + { + "epoch": 0.19111, + "grad_norm": 0.9219062035845695, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 19111 + }, + { + "epoch": 0.19112, + "grad_norm": 0.8901108010065315, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 19112 + }, + { + "epoch": 0.19113, + "grad_norm": 1.0587914695573095, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 19113 + }, + { + "epoch": 0.19114, + "grad_norm": 1.044476999135324, + "learning_rate": 0.003, + "loss": 4.04, + "step": 19114 + }, + { + "epoch": 0.19115, + "grad_norm": 0.9053654633231307, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 19115 + }, + { + "epoch": 0.19116, + "grad_norm": 0.8977068127848019, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 19116 + }, + { + "epoch": 0.19117, + "grad_norm": 0.8813162473390667, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 19117 + }, + { + "epoch": 0.19118, + "grad_norm": 0.8408219122726204, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 19118 + }, + { + "epoch": 0.19119, + "grad_norm": 0.7658415895624578, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 19119 + }, + { + "epoch": 0.1912, + "grad_norm": 0.8065178753069221, + "learning_rate": 0.003, + "loss": 4.075, + "step": 19120 + }, + { + "epoch": 0.19121, + "grad_norm": 0.8650369341587718, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 19121 + }, + { + "epoch": 0.19122, + "grad_norm": 0.8291494114776464, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 19122 + }, + { + "epoch": 0.19123, + "grad_norm": 0.8914803818441838, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 19123 + }, + { + "epoch": 0.19124, + "grad_norm": 1.102372637490019, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 19124 + }, + { + "epoch": 0.19125, + "grad_norm": 1.1368227004790603, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 19125 + }, + { + "epoch": 0.19126, + "grad_norm": 0.7957796561014767, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 19126 + }, + { + "epoch": 0.19127, + "grad_norm": 0.6904629736784584, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 19127 + }, + { + "epoch": 0.19128, + "grad_norm": 0.7618235887730825, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 19128 + }, + { + "epoch": 0.19129, + "grad_norm": 0.8652592177286308, + "learning_rate": 0.003, + "loss": 4.066, + "step": 19129 + }, + { + "epoch": 0.1913, + "grad_norm": 1.00902247292995, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 19130 + }, + { + "epoch": 0.19131, + "grad_norm": 1.1941754642577591, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 19131 + }, + { + "epoch": 0.19132, + "grad_norm": 1.0522844401627696, + "learning_rate": 0.003, + "loss": 4.061, + "step": 19132 + }, + { + "epoch": 0.19133, + "grad_norm": 1.1078820619546768, + "learning_rate": 0.003, + "loss": 4.039, + "step": 19133 + }, + { + "epoch": 0.19134, + "grad_norm": 0.8916828609188207, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 19134 + }, + { + "epoch": 0.19135, + "grad_norm": 0.7758340235402817, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 19135 + }, + { + "epoch": 0.19136, + "grad_norm": 0.758903533761499, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 19136 + }, + { + "epoch": 0.19137, + "grad_norm": 0.7985332910244933, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 19137 + }, + { + "epoch": 0.19138, + "grad_norm": 0.8872785736897502, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 19138 + }, + { + "epoch": 0.19139, + "grad_norm": 1.1690459392571224, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 19139 + }, + { + "epoch": 0.1914, + "grad_norm": 1.0149515533241773, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 19140 + }, + { + "epoch": 0.19141, + "grad_norm": 0.9234684303330382, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 19141 + }, + { + "epoch": 0.19142, + "grad_norm": 0.9974718989558207, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 19142 + }, + { + "epoch": 0.19143, + "grad_norm": 1.1275730156046364, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 19143 + }, + { + "epoch": 0.19144, + "grad_norm": 1.043525932775733, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 19144 + }, + { + "epoch": 0.19145, + "grad_norm": 0.9635472066146746, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 19145 + }, + { + "epoch": 0.19146, + "grad_norm": 0.9366549747109736, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 19146 + }, + { + "epoch": 0.19147, + "grad_norm": 0.9556797261385593, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 19147 + }, + { + "epoch": 0.19148, + "grad_norm": 0.9968319204491339, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 19148 + }, + { + "epoch": 0.19149, + "grad_norm": 0.9789725136212473, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 19149 + }, + { + "epoch": 0.1915, + "grad_norm": 0.921798913910574, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 19150 + }, + { + "epoch": 0.19151, + "grad_norm": 0.8979370775907503, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 19151 + }, + { + "epoch": 0.19152, + "grad_norm": 0.9881899412066255, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 19152 + }, + { + "epoch": 0.19153, + "grad_norm": 1.1728827136099387, + "learning_rate": 0.003, + "loss": 4.078, + "step": 19153 + }, + { + "epoch": 0.19154, + "grad_norm": 0.8900696701292256, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 19154 + }, + { + "epoch": 0.19155, + "grad_norm": 0.7598033224415386, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 19155 + }, + { + "epoch": 0.19156, + "grad_norm": 0.6770261811478786, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 19156 + }, + { + "epoch": 0.19157, + "grad_norm": 0.6570370501712361, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 19157 + }, + { + "epoch": 0.19158, + "grad_norm": 0.6639133765216619, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 19158 + }, + { + "epoch": 0.19159, + "grad_norm": 0.6723059500159161, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 19159 + }, + { + "epoch": 0.1916, + "grad_norm": 0.8238752444239678, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 19160 + }, + { + "epoch": 0.19161, + "grad_norm": 1.0240555657257644, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 19161 + }, + { + "epoch": 0.19162, + "grad_norm": 1.1364190715089406, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 19162 + }, + { + "epoch": 0.19163, + "grad_norm": 1.0650643978733716, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 19163 + }, + { + "epoch": 0.19164, + "grad_norm": 0.924337511105464, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 19164 + }, + { + "epoch": 0.19165, + "grad_norm": 0.7955589991008061, + "learning_rate": 0.003, + "loss": 4.043, + "step": 19165 + }, + { + "epoch": 0.19166, + "grad_norm": 0.778886009131878, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 19166 + }, + { + "epoch": 0.19167, + "grad_norm": 0.7292515234082881, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 19167 + }, + { + "epoch": 0.19168, + "grad_norm": 0.6684820134081613, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 19168 + }, + { + "epoch": 0.19169, + "grad_norm": 0.5816988168759027, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 19169 + }, + { + "epoch": 0.1917, + "grad_norm": 0.5620713084851875, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 19170 + }, + { + "epoch": 0.19171, + "grad_norm": 0.5974182691705774, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 19171 + }, + { + "epoch": 0.19172, + "grad_norm": 0.6767206688909244, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 19172 + }, + { + "epoch": 0.19173, + "grad_norm": 0.7746260951447896, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 19173 + }, + { + "epoch": 0.19174, + "grad_norm": 0.7642247781590363, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 19174 + }, + { + "epoch": 0.19175, + "grad_norm": 0.8457011397987604, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 19175 + }, + { + "epoch": 0.19176, + "grad_norm": 0.916909573871974, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 19176 + }, + { + "epoch": 0.19177, + "grad_norm": 0.9492617484767752, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 19177 + }, + { + "epoch": 0.19178, + "grad_norm": 1.0149750443993122, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 19178 + }, + { + "epoch": 0.19179, + "grad_norm": 1.0933227967891406, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 19179 + }, + { + "epoch": 0.1918, + "grad_norm": 0.9532154672568713, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 19180 + }, + { + "epoch": 0.19181, + "grad_norm": 1.4331795061474801, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 19181 + }, + { + "epoch": 0.19182, + "grad_norm": 0.8201519184611736, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 19182 + }, + { + "epoch": 0.19183, + "grad_norm": 0.9013677698648515, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 19183 + }, + { + "epoch": 0.19184, + "grad_norm": 0.943429964425819, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 19184 + }, + { + "epoch": 0.19185, + "grad_norm": 1.0073760741135105, + "learning_rate": 0.003, + "loss": 4.06, + "step": 19185 + }, + { + "epoch": 0.19186, + "grad_norm": 1.0929939784382448, + "learning_rate": 0.003, + "loss": 4.046, + "step": 19186 + }, + { + "epoch": 0.19187, + "grad_norm": 1.2413339685325235, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 19187 + }, + { + "epoch": 0.19188, + "grad_norm": 1.2020388720618786, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 19188 + }, + { + "epoch": 0.19189, + "grad_norm": 0.9481795929159604, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 19189 + }, + { + "epoch": 0.1919, + "grad_norm": 0.9919308863807638, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 19190 + }, + { + "epoch": 0.19191, + "grad_norm": 0.998820204590595, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 19191 + }, + { + "epoch": 0.19192, + "grad_norm": 1.0408854899950093, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 19192 + }, + { + "epoch": 0.19193, + "grad_norm": 1.04311058320516, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 19193 + }, + { + "epoch": 0.19194, + "grad_norm": 1.1941497721833318, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 19194 + }, + { + "epoch": 0.19195, + "grad_norm": 0.920139316469817, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 19195 + }, + { + "epoch": 0.19196, + "grad_norm": 0.9108519081658464, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 19196 + }, + { + "epoch": 0.19197, + "grad_norm": 0.9223702416502508, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 19197 + }, + { + "epoch": 0.19198, + "grad_norm": 0.8509627782828049, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 19198 + }, + { + "epoch": 0.19199, + "grad_norm": 0.7289220032264974, + "learning_rate": 0.003, + "loss": 4.088, + "step": 19199 + }, + { + "epoch": 0.192, + "grad_norm": 0.7224358854503841, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 19200 + }, + { + "epoch": 0.19201, + "grad_norm": 0.7544214058537576, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 19201 + }, + { + "epoch": 0.19202, + "grad_norm": 0.8092066777788773, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 19202 + }, + { + "epoch": 0.19203, + "grad_norm": 0.9368709707212719, + "learning_rate": 0.003, + "loss": 4.039, + "step": 19203 + }, + { + "epoch": 0.19204, + "grad_norm": 1.0931590885061948, + "learning_rate": 0.003, + "loss": 4.054, + "step": 19204 + }, + { + "epoch": 0.19205, + "grad_norm": 0.8770844795871859, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 19205 + }, + { + "epoch": 0.19206, + "grad_norm": 0.8306421817763099, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 19206 + }, + { + "epoch": 0.19207, + "grad_norm": 0.878648736336449, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 19207 + }, + { + "epoch": 0.19208, + "grad_norm": 0.8875133742494693, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 19208 + }, + { + "epoch": 0.19209, + "grad_norm": 0.793238499807842, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 19209 + }, + { + "epoch": 0.1921, + "grad_norm": 0.7757701288960257, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 19210 + }, + { + "epoch": 0.19211, + "grad_norm": 0.7824645907529354, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 19211 + }, + { + "epoch": 0.19212, + "grad_norm": 0.8177183774141822, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 19212 + }, + { + "epoch": 0.19213, + "grad_norm": 0.929446946537238, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 19213 + }, + { + "epoch": 0.19214, + "grad_norm": 1.0867526246727846, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 19214 + }, + { + "epoch": 0.19215, + "grad_norm": 0.9013704811261157, + "learning_rate": 0.003, + "loss": 4.085, + "step": 19215 + }, + { + "epoch": 0.19216, + "grad_norm": 0.7802080780659087, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 19216 + }, + { + "epoch": 0.19217, + "grad_norm": 0.7052820494757375, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 19217 + }, + { + "epoch": 0.19218, + "grad_norm": 0.6840988159643647, + "learning_rate": 0.003, + "loss": 4.1008, + "step": 19218 + }, + { + "epoch": 0.19219, + "grad_norm": 0.6891256990062292, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 19219 + }, + { + "epoch": 0.1922, + "grad_norm": 0.6592335983566835, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 19220 + }, + { + "epoch": 0.19221, + "grad_norm": 0.7092450067835178, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 19221 + }, + { + "epoch": 0.19222, + "grad_norm": 0.7737560399743477, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 19222 + }, + { + "epoch": 0.19223, + "grad_norm": 1.0353930285151072, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 19223 + }, + { + "epoch": 0.19224, + "grad_norm": 1.160953135931673, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 19224 + }, + { + "epoch": 0.19225, + "grad_norm": 1.0047042315574906, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 19225 + }, + { + "epoch": 0.19226, + "grad_norm": 1.094540519862959, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 19226 + }, + { + "epoch": 0.19227, + "grad_norm": 0.870391283859072, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 19227 + }, + { + "epoch": 0.19228, + "grad_norm": 0.8355690166155801, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 19228 + }, + { + "epoch": 0.19229, + "grad_norm": 0.8620252155285648, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 19229 + }, + { + "epoch": 0.1923, + "grad_norm": 0.9896399499042035, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 19230 + }, + { + "epoch": 0.19231, + "grad_norm": 0.9658354004622579, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 19231 + }, + { + "epoch": 0.19232, + "grad_norm": 0.9207134572200805, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 19232 + }, + { + "epoch": 0.19233, + "grad_norm": 0.9680384497789604, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 19233 + }, + { + "epoch": 0.19234, + "grad_norm": 0.979017491808022, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 19234 + }, + { + "epoch": 0.19235, + "grad_norm": 0.9468907270544002, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 19235 + }, + { + "epoch": 0.19236, + "grad_norm": 0.8974984596975909, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 19236 + }, + { + "epoch": 0.19237, + "grad_norm": 0.9601350008999757, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 19237 + }, + { + "epoch": 0.19238, + "grad_norm": 0.9336776046445143, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 19238 + }, + { + "epoch": 0.19239, + "grad_norm": 1.0093740356448468, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 19239 + }, + { + "epoch": 0.1924, + "grad_norm": 1.2536772971899799, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 19240 + }, + { + "epoch": 0.19241, + "grad_norm": 1.0182498434339136, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 19241 + }, + { + "epoch": 0.19242, + "grad_norm": 1.0837510483224895, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 19242 + }, + { + "epoch": 0.19243, + "grad_norm": 0.9225747469867477, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 19243 + }, + { + "epoch": 0.19244, + "grad_norm": 0.7844361174545726, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 19244 + }, + { + "epoch": 0.19245, + "grad_norm": 0.7574314633985662, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 19245 + }, + { + "epoch": 0.19246, + "grad_norm": 0.8707291741019877, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 19246 + }, + { + "epoch": 0.19247, + "grad_norm": 1.09006320519354, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 19247 + }, + { + "epoch": 0.19248, + "grad_norm": 1.0023955625391028, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 19248 + }, + { + "epoch": 0.19249, + "grad_norm": 0.9469880272587307, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 19249 + }, + { + "epoch": 0.1925, + "grad_norm": 0.9076851822165349, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 19250 + }, + { + "epoch": 0.19251, + "grad_norm": 0.9804263974244334, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 19251 + }, + { + "epoch": 0.19252, + "grad_norm": 1.0621242569370077, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 19252 + }, + { + "epoch": 0.19253, + "grad_norm": 0.8462923391150671, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 19253 + }, + { + "epoch": 0.19254, + "grad_norm": 0.9115455068799846, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 19254 + }, + { + "epoch": 0.19255, + "grad_norm": 1.0029584191222796, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 19255 + }, + { + "epoch": 0.19256, + "grad_norm": 0.9674100852940246, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 19256 + }, + { + "epoch": 0.19257, + "grad_norm": 0.8292408728437941, + "learning_rate": 0.003, + "loss": 4.036, + "step": 19257 + }, + { + "epoch": 0.19258, + "grad_norm": 1.009116301371756, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 19258 + }, + { + "epoch": 0.19259, + "grad_norm": 1.1966204992888037, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 19259 + }, + { + "epoch": 0.1926, + "grad_norm": 0.7878949451380268, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 19260 + }, + { + "epoch": 0.19261, + "grad_norm": 0.7808985411678049, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 19261 + }, + { + "epoch": 0.19262, + "grad_norm": 0.8547357589002595, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 19262 + }, + { + "epoch": 0.19263, + "grad_norm": 0.718288020292227, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 19263 + }, + { + "epoch": 0.19264, + "grad_norm": 0.6304679970206166, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 19264 + }, + { + "epoch": 0.19265, + "grad_norm": 0.724331469536231, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 19265 + }, + { + "epoch": 0.19266, + "grad_norm": 0.815068469050919, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 19266 + }, + { + "epoch": 0.19267, + "grad_norm": 0.8514287596550341, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 19267 + }, + { + "epoch": 0.19268, + "grad_norm": 0.9774310685272432, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 19268 + }, + { + "epoch": 0.19269, + "grad_norm": 1.0914727380265792, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 19269 + }, + { + "epoch": 0.1927, + "grad_norm": 0.9712674589547434, + "learning_rate": 0.003, + "loss": 4.055, + "step": 19270 + }, + { + "epoch": 0.19271, + "grad_norm": 1.0115780669642893, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 19271 + }, + { + "epoch": 0.19272, + "grad_norm": 0.8872210112799476, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 19272 + }, + { + "epoch": 0.19273, + "grad_norm": 0.7578652090507335, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 19273 + }, + { + "epoch": 0.19274, + "grad_norm": 0.759825135477429, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 19274 + }, + { + "epoch": 0.19275, + "grad_norm": 0.7997469392525652, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 19275 + }, + { + "epoch": 0.19276, + "grad_norm": 0.7291875124795859, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 19276 + }, + { + "epoch": 0.19277, + "grad_norm": 0.732910639686414, + "learning_rate": 0.003, + "loss": 4.093, + "step": 19277 + }, + { + "epoch": 0.19278, + "grad_norm": 0.7504022956990735, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 19278 + }, + { + "epoch": 0.19279, + "grad_norm": 0.7145183158646564, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 19279 + }, + { + "epoch": 0.1928, + "grad_norm": 0.6809551669791736, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 19280 + }, + { + "epoch": 0.19281, + "grad_norm": 0.7163662690691528, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 19281 + }, + { + "epoch": 0.19282, + "grad_norm": 0.7850505360953289, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 19282 + }, + { + "epoch": 0.19283, + "grad_norm": 0.9407779214143366, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 19283 + }, + { + "epoch": 0.19284, + "grad_norm": 1.4375717729943787, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 19284 + }, + { + "epoch": 0.19285, + "grad_norm": 0.684169909298226, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 19285 + }, + { + "epoch": 0.19286, + "grad_norm": 0.8695288130362363, + "learning_rate": 0.003, + "loss": 4.025, + "step": 19286 + }, + { + "epoch": 0.19287, + "grad_norm": 1.1242503477218868, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 19287 + }, + { + "epoch": 0.19288, + "grad_norm": 0.8466664857908782, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 19288 + }, + { + "epoch": 0.19289, + "grad_norm": 0.7283764504237402, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 19289 + }, + { + "epoch": 0.1929, + "grad_norm": 0.7640308214485745, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 19290 + }, + { + "epoch": 0.19291, + "grad_norm": 0.76545557633287, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 19291 + }, + { + "epoch": 0.19292, + "grad_norm": 0.8660208301542295, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 19292 + }, + { + "epoch": 0.19293, + "grad_norm": 1.006380575264228, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 19293 + }, + { + "epoch": 0.19294, + "grad_norm": 1.0174367698461975, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 19294 + }, + { + "epoch": 0.19295, + "grad_norm": 0.9156945148357316, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 19295 + }, + { + "epoch": 0.19296, + "grad_norm": 0.8934904545697673, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 19296 + }, + { + "epoch": 0.19297, + "grad_norm": 0.8166606962145884, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 19297 + }, + { + "epoch": 0.19298, + "grad_norm": 0.8204643793301916, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 19298 + }, + { + "epoch": 0.19299, + "grad_norm": 0.7884797328812746, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 19299 + }, + { + "epoch": 0.193, + "grad_norm": 0.8848245411938087, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 19300 + }, + { + "epoch": 0.19301, + "grad_norm": 0.9491213559661399, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 19301 + }, + { + "epoch": 0.19302, + "grad_norm": 1.201532040807617, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 19302 + }, + { + "epoch": 0.19303, + "grad_norm": 0.8820087392952594, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 19303 + }, + { + "epoch": 0.19304, + "grad_norm": 0.9067089569250883, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 19304 + }, + { + "epoch": 0.19305, + "grad_norm": 1.0074690436179412, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 19305 + }, + { + "epoch": 0.19306, + "grad_norm": 0.9882078433848459, + "learning_rate": 0.003, + "loss": 4.069, + "step": 19306 + }, + { + "epoch": 0.19307, + "grad_norm": 1.0188381908844888, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 19307 + }, + { + "epoch": 0.19308, + "grad_norm": 0.9821704739833307, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 19308 + }, + { + "epoch": 0.19309, + "grad_norm": 0.942424683799336, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 19309 + }, + { + "epoch": 0.1931, + "grad_norm": 0.9334417165161468, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 19310 + }, + { + "epoch": 0.19311, + "grad_norm": 1.0651321222090708, + "learning_rate": 0.003, + "loss": 4.068, + "step": 19311 + }, + { + "epoch": 0.19312, + "grad_norm": 1.2103711850247247, + "learning_rate": 0.003, + "loss": 4.073, + "step": 19312 + }, + { + "epoch": 0.19313, + "grad_norm": 0.8972642524800851, + "learning_rate": 0.003, + "loss": 4.1, + "step": 19313 + }, + { + "epoch": 0.19314, + "grad_norm": 0.8036964751306692, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 19314 + }, + { + "epoch": 0.19315, + "grad_norm": 0.726795320706453, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 19315 + }, + { + "epoch": 0.19316, + "grad_norm": 0.6811792939150739, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 19316 + }, + { + "epoch": 0.19317, + "grad_norm": 0.7024349095635795, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 19317 + }, + { + "epoch": 0.19318, + "grad_norm": 0.7572675667273833, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 19318 + }, + { + "epoch": 0.19319, + "grad_norm": 0.8285780744679329, + "learning_rate": 0.003, + "loss": 4.054, + "step": 19319 + }, + { + "epoch": 0.1932, + "grad_norm": 1.12929868918835, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 19320 + }, + { + "epoch": 0.19321, + "grad_norm": 0.9536326676893999, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 19321 + }, + { + "epoch": 0.19322, + "grad_norm": 0.9668487379190761, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 19322 + }, + { + "epoch": 0.19323, + "grad_norm": 0.9753566245316622, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 19323 + }, + { + "epoch": 0.19324, + "grad_norm": 1.0201313993998238, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 19324 + }, + { + "epoch": 0.19325, + "grad_norm": 0.9187757221333906, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 19325 + }, + { + "epoch": 0.19326, + "grad_norm": 0.8503241399014173, + "learning_rate": 0.003, + "loss": 4.067, + "step": 19326 + }, + { + "epoch": 0.19327, + "grad_norm": 0.6947144636157419, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 19327 + }, + { + "epoch": 0.19328, + "grad_norm": 0.6595103173716894, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 19328 + }, + { + "epoch": 0.19329, + "grad_norm": 0.7053577224401365, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 19329 + }, + { + "epoch": 0.1933, + "grad_norm": 0.8821891890970084, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 19330 + }, + { + "epoch": 0.19331, + "grad_norm": 0.9967821902363833, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 19331 + }, + { + "epoch": 0.19332, + "grad_norm": 1.1995869818275835, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 19332 + }, + { + "epoch": 0.19333, + "grad_norm": 0.9189640826374365, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 19333 + }, + { + "epoch": 0.19334, + "grad_norm": 1.0077012014502056, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 19334 + }, + { + "epoch": 0.19335, + "grad_norm": 0.9147865305334655, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 19335 + }, + { + "epoch": 0.19336, + "grad_norm": 0.9077212628452612, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 19336 + }, + { + "epoch": 0.19337, + "grad_norm": 0.9862699628741036, + "learning_rate": 0.003, + "loss": 4.056, + "step": 19337 + }, + { + "epoch": 0.19338, + "grad_norm": 1.1451910546203383, + "learning_rate": 0.003, + "loss": 4.102, + "step": 19338 + }, + { + "epoch": 0.19339, + "grad_norm": 0.8767077068542017, + "learning_rate": 0.003, + "loss": 4.064, + "step": 19339 + }, + { + "epoch": 0.1934, + "grad_norm": 0.8457171250838031, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 19340 + }, + { + "epoch": 0.19341, + "grad_norm": 0.7592240416015384, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 19341 + }, + { + "epoch": 0.19342, + "grad_norm": 0.6983669343797456, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 19342 + }, + { + "epoch": 0.19343, + "grad_norm": 0.7785188933601789, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 19343 + }, + { + "epoch": 0.19344, + "grad_norm": 0.9046180073407409, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 19344 + }, + { + "epoch": 0.19345, + "grad_norm": 1.0366997635756199, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 19345 + }, + { + "epoch": 0.19346, + "grad_norm": 1.0162979133688614, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 19346 + }, + { + "epoch": 0.19347, + "grad_norm": 0.9060762641110706, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 19347 + }, + { + "epoch": 0.19348, + "grad_norm": 0.9442767671289194, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 19348 + }, + { + "epoch": 0.19349, + "grad_norm": 0.9573196738046161, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 19349 + }, + { + "epoch": 0.1935, + "grad_norm": 1.026506296420836, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 19350 + }, + { + "epoch": 0.19351, + "grad_norm": 0.937231457026112, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 19351 + }, + { + "epoch": 0.19352, + "grad_norm": 1.0436098863612515, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 19352 + }, + { + "epoch": 0.19353, + "grad_norm": 1.1122056396611406, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 19353 + }, + { + "epoch": 0.19354, + "grad_norm": 0.7951754885001442, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 19354 + }, + { + "epoch": 0.19355, + "grad_norm": 0.731618306665379, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 19355 + }, + { + "epoch": 0.19356, + "grad_norm": 0.8169119441950538, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 19356 + }, + { + "epoch": 0.19357, + "grad_norm": 0.9951121658364063, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 19357 + }, + { + "epoch": 0.19358, + "grad_norm": 1.2601588573719498, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 19358 + }, + { + "epoch": 0.19359, + "grad_norm": 0.8177939686033251, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 19359 + }, + { + "epoch": 0.1936, + "grad_norm": 0.8505608578205448, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 19360 + }, + { + "epoch": 0.19361, + "grad_norm": 1.0998820770390672, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 19361 + }, + { + "epoch": 0.19362, + "grad_norm": 1.1157030478507055, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 19362 + }, + { + "epoch": 0.19363, + "grad_norm": 0.79935380789488, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 19363 + }, + { + "epoch": 0.19364, + "grad_norm": 0.7196371621609685, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 19364 + }, + { + "epoch": 0.19365, + "grad_norm": 0.7715266907600049, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 19365 + }, + { + "epoch": 0.19366, + "grad_norm": 0.644796637237514, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 19366 + }, + { + "epoch": 0.19367, + "grad_norm": 0.5818694184589748, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 19367 + }, + { + "epoch": 0.19368, + "grad_norm": 0.5469812151673934, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 19368 + }, + { + "epoch": 0.19369, + "grad_norm": 0.5642327960132972, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 19369 + }, + { + "epoch": 0.1937, + "grad_norm": 0.5491507178522439, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 19370 + }, + { + "epoch": 0.19371, + "grad_norm": 0.6178386048368959, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 19371 + }, + { + "epoch": 0.19372, + "grad_norm": 0.6792661145423722, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 19372 + }, + { + "epoch": 0.19373, + "grad_norm": 0.7579709646077045, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 19373 + }, + { + "epoch": 0.19374, + "grad_norm": 0.8043491088295369, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 19374 + }, + { + "epoch": 0.19375, + "grad_norm": 0.9173096652133536, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 19375 + }, + { + "epoch": 0.19376, + "grad_norm": 1.104770502970042, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 19376 + }, + { + "epoch": 0.19377, + "grad_norm": 0.9931044029743681, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 19377 + }, + { + "epoch": 0.19378, + "grad_norm": 1.2575693666209646, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 19378 + }, + { + "epoch": 0.19379, + "grad_norm": 0.7893045372481265, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 19379 + }, + { + "epoch": 0.1938, + "grad_norm": 0.7290089644437999, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 19380 + }, + { + "epoch": 0.19381, + "grad_norm": 0.7954150049719874, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 19381 + }, + { + "epoch": 0.19382, + "grad_norm": 0.9335877267014221, + "learning_rate": 0.003, + "loss": 4.072, + "step": 19382 + }, + { + "epoch": 0.19383, + "grad_norm": 1.1566640119912726, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 19383 + }, + { + "epoch": 0.19384, + "grad_norm": 0.9305220064139013, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 19384 + }, + { + "epoch": 0.19385, + "grad_norm": 0.9284443858567036, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 19385 + }, + { + "epoch": 0.19386, + "grad_norm": 0.8698772978575846, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 19386 + }, + { + "epoch": 0.19387, + "grad_norm": 0.9039789823960264, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 19387 + }, + { + "epoch": 0.19388, + "grad_norm": 0.8938361441679321, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 19388 + }, + { + "epoch": 0.19389, + "grad_norm": 1.079251678335314, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 19389 + }, + { + "epoch": 0.1939, + "grad_norm": 0.9888540957536921, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 19390 + }, + { + "epoch": 0.19391, + "grad_norm": 0.8653988459312033, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 19391 + }, + { + "epoch": 0.19392, + "grad_norm": 0.7387427348858798, + "learning_rate": 0.003, + "loss": 4.052, + "step": 19392 + }, + { + "epoch": 0.19393, + "grad_norm": 0.7195453908865173, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 19393 + }, + { + "epoch": 0.19394, + "grad_norm": 1.0132534843416474, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 19394 + }, + { + "epoch": 0.19395, + "grad_norm": 1.438402311165173, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 19395 + }, + { + "epoch": 0.19396, + "grad_norm": 0.6832755764345744, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 19396 + }, + { + "epoch": 0.19397, + "grad_norm": 0.7173595937643056, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 19397 + }, + { + "epoch": 0.19398, + "grad_norm": 0.7412058429957814, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 19398 + }, + { + "epoch": 0.19399, + "grad_norm": 0.7348205577739316, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 19399 + }, + { + "epoch": 0.194, + "grad_norm": 0.7589437145220229, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 19400 + }, + { + "epoch": 0.19401, + "grad_norm": 0.7744153690054988, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 19401 + }, + { + "epoch": 0.19402, + "grad_norm": 0.7842889187210925, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 19402 + }, + { + "epoch": 0.19403, + "grad_norm": 0.693093910166812, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 19403 + }, + { + "epoch": 0.19404, + "grad_norm": 0.6746526719944133, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 19404 + }, + { + "epoch": 0.19405, + "grad_norm": 0.6850649742412825, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 19405 + }, + { + "epoch": 0.19406, + "grad_norm": 0.8077518972655819, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 19406 + }, + { + "epoch": 0.19407, + "grad_norm": 0.9910010785169938, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 19407 + }, + { + "epoch": 0.19408, + "grad_norm": 1.2453570176589803, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 19408 + }, + { + "epoch": 0.19409, + "grad_norm": 0.8234145584135474, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 19409 + }, + { + "epoch": 0.1941, + "grad_norm": 0.8663858509496111, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 19410 + }, + { + "epoch": 0.19411, + "grad_norm": 1.0228186203156835, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 19411 + }, + { + "epoch": 0.19412, + "grad_norm": 1.184175206863987, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 19412 + }, + { + "epoch": 0.19413, + "grad_norm": 0.8128680746578928, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 19413 + }, + { + "epoch": 0.19414, + "grad_norm": 0.8349677648661027, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 19414 + }, + { + "epoch": 0.19415, + "grad_norm": 0.839110250731865, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 19415 + }, + { + "epoch": 0.19416, + "grad_norm": 1.0530972382185582, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 19416 + }, + { + "epoch": 0.19417, + "grad_norm": 1.2071529196567012, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 19417 + }, + { + "epoch": 0.19418, + "grad_norm": 0.7532693429543044, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 19418 + }, + { + "epoch": 0.19419, + "grad_norm": 0.7025049287295203, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 19419 + }, + { + "epoch": 0.1942, + "grad_norm": 0.6617869994428833, + "learning_rate": 0.003, + "loss": 4.034, + "step": 19420 + }, + { + "epoch": 0.19421, + "grad_norm": 0.7107982239512384, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 19421 + }, + { + "epoch": 0.19422, + "grad_norm": 0.7731539221854351, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 19422 + }, + { + "epoch": 0.19423, + "grad_norm": 0.7954650930536287, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 19423 + }, + { + "epoch": 0.19424, + "grad_norm": 0.744812551693108, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 19424 + }, + { + "epoch": 0.19425, + "grad_norm": 0.7845461767458478, + "learning_rate": 0.003, + "loss": 4.046, + "step": 19425 + }, + { + "epoch": 0.19426, + "grad_norm": 0.7963268991869785, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 19426 + }, + { + "epoch": 0.19427, + "grad_norm": 0.9046663135799429, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 19427 + }, + { + "epoch": 0.19428, + "grad_norm": 1.1986557570670113, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 19428 + }, + { + "epoch": 0.19429, + "grad_norm": 0.8848341229192824, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 19429 + }, + { + "epoch": 0.1943, + "grad_norm": 0.7520684600772797, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 19430 + }, + { + "epoch": 0.19431, + "grad_norm": 0.7599682318677224, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 19431 + }, + { + "epoch": 0.19432, + "grad_norm": 0.8557189101886336, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 19432 + }, + { + "epoch": 0.19433, + "grad_norm": 0.9104566496816495, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 19433 + }, + { + "epoch": 0.19434, + "grad_norm": 1.0351919580811069, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 19434 + }, + { + "epoch": 0.19435, + "grad_norm": 1.240388030759488, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 19435 + }, + { + "epoch": 0.19436, + "grad_norm": 1.1308522532143102, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 19436 + }, + { + "epoch": 0.19437, + "grad_norm": 1.106405125898721, + "learning_rate": 0.003, + "loss": 4.046, + "step": 19437 + }, + { + "epoch": 0.19438, + "grad_norm": 0.9453742851113698, + "learning_rate": 0.003, + "loss": 4.079, + "step": 19438 + }, + { + "epoch": 0.19439, + "grad_norm": 0.8606477423158553, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 19439 + }, + { + "epoch": 0.1944, + "grad_norm": 0.8429940189120442, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 19440 + }, + { + "epoch": 0.19441, + "grad_norm": 0.7464507941703405, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 19441 + }, + { + "epoch": 0.19442, + "grad_norm": 0.7801455651340992, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 19442 + }, + { + "epoch": 0.19443, + "grad_norm": 0.792522750942894, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 19443 + }, + { + "epoch": 0.19444, + "grad_norm": 0.788629505743987, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 19444 + }, + { + "epoch": 0.19445, + "grad_norm": 0.8301473853232875, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 19445 + }, + { + "epoch": 0.19446, + "grad_norm": 0.8608611945764904, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 19446 + }, + { + "epoch": 0.19447, + "grad_norm": 0.9974219292655547, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 19447 + }, + { + "epoch": 0.19448, + "grad_norm": 1.318589946379268, + "learning_rate": 0.003, + "loss": 4.059, + "step": 19448 + }, + { + "epoch": 0.19449, + "grad_norm": 0.7471531950751678, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 19449 + }, + { + "epoch": 0.1945, + "grad_norm": 0.8234966459974362, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 19450 + }, + { + "epoch": 0.19451, + "grad_norm": 0.8365819171105646, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 19451 + }, + { + "epoch": 0.19452, + "grad_norm": 0.9296322955615434, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 19452 + }, + { + "epoch": 0.19453, + "grad_norm": 1.0848704779287814, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 19453 + }, + { + "epoch": 0.19454, + "grad_norm": 0.8588849849266655, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 19454 + }, + { + "epoch": 0.19455, + "grad_norm": 0.8562132116638607, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 19455 + }, + { + "epoch": 0.19456, + "grad_norm": 0.8576836587299588, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 19456 + }, + { + "epoch": 0.19457, + "grad_norm": 1.0178256252910618, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 19457 + }, + { + "epoch": 0.19458, + "grad_norm": 1.0673845665305683, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 19458 + }, + { + "epoch": 0.19459, + "grad_norm": 0.9526930015094522, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 19459 + }, + { + "epoch": 0.1946, + "grad_norm": 1.0773885084350963, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 19460 + }, + { + "epoch": 0.19461, + "grad_norm": 1.020877896151739, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 19461 + }, + { + "epoch": 0.19462, + "grad_norm": 1.0136060535129237, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 19462 + }, + { + "epoch": 0.19463, + "grad_norm": 0.9954529145757499, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 19463 + }, + { + "epoch": 0.19464, + "grad_norm": 1.0596088310554157, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 19464 + }, + { + "epoch": 0.19465, + "grad_norm": 1.1154382364130757, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 19465 + }, + { + "epoch": 0.19466, + "grad_norm": 1.1784204764828055, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 19466 + }, + { + "epoch": 0.19467, + "grad_norm": 0.9259931971512808, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 19467 + }, + { + "epoch": 0.19468, + "grad_norm": 0.9416683844112438, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 19468 + }, + { + "epoch": 0.19469, + "grad_norm": 1.2413378354355256, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 19469 + }, + { + "epoch": 0.1947, + "grad_norm": 0.9270447216921386, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 19470 + }, + { + "epoch": 0.19471, + "grad_norm": 0.9545181686146227, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 19471 + }, + { + "epoch": 0.19472, + "grad_norm": 1.0339319753001825, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 19472 + }, + { + "epoch": 0.19473, + "grad_norm": 1.000844535995447, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 19473 + }, + { + "epoch": 0.19474, + "grad_norm": 0.9143181943814181, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 19474 + }, + { + "epoch": 0.19475, + "grad_norm": 0.8602119219325912, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 19475 + }, + { + "epoch": 0.19476, + "grad_norm": 0.9540118603831539, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 19476 + }, + { + "epoch": 0.19477, + "grad_norm": 0.9782597139022668, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 19477 + }, + { + "epoch": 0.19478, + "grad_norm": 0.9962543688764389, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 19478 + }, + { + "epoch": 0.19479, + "grad_norm": 0.7779573812208588, + "learning_rate": 0.003, + "loss": 4.058, + "step": 19479 + }, + { + "epoch": 0.1948, + "grad_norm": 0.7301011933193419, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 19480 + }, + { + "epoch": 0.19481, + "grad_norm": 0.7707368473789574, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 19481 + }, + { + "epoch": 0.19482, + "grad_norm": 0.8470135701871124, + "learning_rate": 0.003, + "loss": 4.073, + "step": 19482 + }, + { + "epoch": 0.19483, + "grad_norm": 0.8458066880417292, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 19483 + }, + { + "epoch": 0.19484, + "grad_norm": 0.8337140274876754, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 19484 + }, + { + "epoch": 0.19485, + "grad_norm": 0.7926767981455103, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 19485 + }, + { + "epoch": 0.19486, + "grad_norm": 0.8656239751429737, + "learning_rate": 0.003, + "loss": 4.071, + "step": 19486 + }, + { + "epoch": 0.19487, + "grad_norm": 0.9838859096652176, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 19487 + }, + { + "epoch": 0.19488, + "grad_norm": 0.9871605794357838, + "learning_rate": 0.003, + "loss": 4.048, + "step": 19488 + }, + { + "epoch": 0.19489, + "grad_norm": 1.0088434336621808, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 19489 + }, + { + "epoch": 0.1949, + "grad_norm": 1.1324089063831135, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 19490 + }, + { + "epoch": 0.19491, + "grad_norm": 0.9202283600730529, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 19491 + }, + { + "epoch": 0.19492, + "grad_norm": 0.7821970084215664, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 19492 + }, + { + "epoch": 0.19493, + "grad_norm": 0.6611199839192118, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 19493 + }, + { + "epoch": 0.19494, + "grad_norm": 0.6393568576028075, + "learning_rate": 0.003, + "loss": 4.068, + "step": 19494 + }, + { + "epoch": 0.19495, + "grad_norm": 0.7050742852054983, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 19495 + }, + { + "epoch": 0.19496, + "grad_norm": 0.6817189794167758, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 19496 + }, + { + "epoch": 0.19497, + "grad_norm": 0.7594130011999084, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 19497 + }, + { + "epoch": 0.19498, + "grad_norm": 0.7333688561420428, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 19498 + }, + { + "epoch": 0.19499, + "grad_norm": 0.7123112115752411, + "learning_rate": 0.003, + "loss": 4.07, + "step": 19499 + }, + { + "epoch": 0.195, + "grad_norm": 0.7042225273617304, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 19500 + }, + { + "epoch": 0.19501, + "grad_norm": 0.8029016193437605, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 19501 + }, + { + "epoch": 0.19502, + "grad_norm": 0.9922773101680152, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 19502 + }, + { + "epoch": 0.19503, + "grad_norm": 1.1466301986118883, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 19503 + }, + { + "epoch": 0.19504, + "grad_norm": 0.8807567321460502, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 19504 + }, + { + "epoch": 0.19505, + "grad_norm": 1.0379231428547009, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 19505 + }, + { + "epoch": 0.19506, + "grad_norm": 1.164717902234979, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 19506 + }, + { + "epoch": 0.19507, + "grad_norm": 0.8425390274820713, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 19507 + }, + { + "epoch": 0.19508, + "grad_norm": 0.7959424547295146, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 19508 + }, + { + "epoch": 0.19509, + "grad_norm": 0.8632302355867209, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 19509 + }, + { + "epoch": 0.1951, + "grad_norm": 0.881691131147382, + "learning_rate": 0.003, + "loss": 4.058, + "step": 19510 + }, + { + "epoch": 0.19511, + "grad_norm": 0.9925927341296379, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 19511 + }, + { + "epoch": 0.19512, + "grad_norm": 1.28965793777758, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 19512 + }, + { + "epoch": 0.19513, + "grad_norm": 0.7905992924507371, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 19513 + }, + { + "epoch": 0.19514, + "grad_norm": 0.6384576253160198, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 19514 + }, + { + "epoch": 0.19515, + "grad_norm": 0.6166443571201118, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 19515 + }, + { + "epoch": 0.19516, + "grad_norm": 0.6867575151399782, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 19516 + }, + { + "epoch": 0.19517, + "grad_norm": 0.7685770719860647, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 19517 + }, + { + "epoch": 0.19518, + "grad_norm": 0.8594828737342208, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 19518 + }, + { + "epoch": 0.19519, + "grad_norm": 0.9872709054688854, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 19519 + }, + { + "epoch": 0.1952, + "grad_norm": 1.2125394175737512, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 19520 + }, + { + "epoch": 0.19521, + "grad_norm": 0.886659689134626, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 19521 + }, + { + "epoch": 0.19522, + "grad_norm": 0.9636041826651245, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 19522 + }, + { + "epoch": 0.19523, + "grad_norm": 1.205013653960159, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 19523 + }, + { + "epoch": 0.19524, + "grad_norm": 0.9724836120974313, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 19524 + }, + { + "epoch": 0.19525, + "grad_norm": 1.0118739956056824, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 19525 + }, + { + "epoch": 0.19526, + "grad_norm": 1.0064466822541676, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 19526 + }, + { + "epoch": 0.19527, + "grad_norm": 0.9651436841120309, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 19527 + }, + { + "epoch": 0.19528, + "grad_norm": 0.8937188861082559, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 19528 + }, + { + "epoch": 0.19529, + "grad_norm": 0.8821529187386105, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 19529 + }, + { + "epoch": 0.1953, + "grad_norm": 0.8252793530031995, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 19530 + }, + { + "epoch": 0.19531, + "grad_norm": 0.7651803369307546, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 19531 + }, + { + "epoch": 0.19532, + "grad_norm": 0.8058864113766969, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 19532 + }, + { + "epoch": 0.19533, + "grad_norm": 1.0366595185288165, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 19533 + }, + { + "epoch": 0.19534, + "grad_norm": 1.051722844597907, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 19534 + }, + { + "epoch": 0.19535, + "grad_norm": 0.9434461648697393, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 19535 + }, + { + "epoch": 0.19536, + "grad_norm": 0.9725778695812081, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 19536 + }, + { + "epoch": 0.19537, + "grad_norm": 1.017590227573199, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 19537 + }, + { + "epoch": 0.19538, + "grad_norm": 0.9968575972285362, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 19538 + }, + { + "epoch": 0.19539, + "grad_norm": 0.9359579432709345, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 19539 + }, + { + "epoch": 0.1954, + "grad_norm": 0.8686704791280276, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 19540 + }, + { + "epoch": 0.19541, + "grad_norm": 0.897106152786685, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 19541 + }, + { + "epoch": 0.19542, + "grad_norm": 1.0077546120042802, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 19542 + }, + { + "epoch": 0.19543, + "grad_norm": 1.0636340344146884, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 19543 + }, + { + "epoch": 0.19544, + "grad_norm": 1.0095013765477652, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 19544 + }, + { + "epoch": 0.19545, + "grad_norm": 1.030678871683277, + "learning_rate": 0.003, + "loss": 4.075, + "step": 19545 + }, + { + "epoch": 0.19546, + "grad_norm": 1.0129705246150944, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 19546 + }, + { + "epoch": 0.19547, + "grad_norm": 0.9631765361708327, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 19547 + }, + { + "epoch": 0.19548, + "grad_norm": 0.963864305064408, + "learning_rate": 0.003, + "loss": 4.076, + "step": 19548 + }, + { + "epoch": 0.19549, + "grad_norm": 0.9961369640847921, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 19549 + }, + { + "epoch": 0.1955, + "grad_norm": 0.9152450673234607, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 19550 + }, + { + "epoch": 0.19551, + "grad_norm": 0.8857343149036193, + "learning_rate": 0.003, + "loss": 4.092, + "step": 19551 + }, + { + "epoch": 0.19552, + "grad_norm": 0.9627084866512174, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 19552 + }, + { + "epoch": 0.19553, + "grad_norm": 1.0490876132405937, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 19553 + }, + { + "epoch": 0.19554, + "grad_norm": 0.8631105090005501, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 19554 + }, + { + "epoch": 0.19555, + "grad_norm": 0.7983585693124079, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 19555 + }, + { + "epoch": 0.19556, + "grad_norm": 0.911049293813308, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 19556 + }, + { + "epoch": 0.19557, + "grad_norm": 1.0179910902311773, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 19557 + }, + { + "epoch": 0.19558, + "grad_norm": 0.9814181656212816, + "learning_rate": 0.003, + "loss": 4.055, + "step": 19558 + }, + { + "epoch": 0.19559, + "grad_norm": 0.9199296198007286, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 19559 + }, + { + "epoch": 0.1956, + "grad_norm": 0.8530861861107941, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 19560 + }, + { + "epoch": 0.19561, + "grad_norm": 0.8013947064723252, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 19561 + }, + { + "epoch": 0.19562, + "grad_norm": 0.8313204093885591, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 19562 + }, + { + "epoch": 0.19563, + "grad_norm": 0.8264158893586153, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 19563 + }, + { + "epoch": 0.19564, + "grad_norm": 0.8277466957586677, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 19564 + }, + { + "epoch": 0.19565, + "grad_norm": 0.9135284567351104, + "learning_rate": 0.003, + "loss": 4.035, + "step": 19565 + }, + { + "epoch": 0.19566, + "grad_norm": 1.0679433935045328, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 19566 + }, + { + "epoch": 0.19567, + "grad_norm": 0.9851444389545205, + "learning_rate": 0.003, + "loss": 4.058, + "step": 19567 + }, + { + "epoch": 0.19568, + "grad_norm": 0.8402413726976194, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 19568 + }, + { + "epoch": 0.19569, + "grad_norm": 0.9457773671925149, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 19569 + }, + { + "epoch": 0.1957, + "grad_norm": 1.1822131417765054, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 19570 + }, + { + "epoch": 0.19571, + "grad_norm": 0.7814700340209935, + "learning_rate": 0.003, + "loss": 4.064, + "step": 19571 + }, + { + "epoch": 0.19572, + "grad_norm": 0.7549355950312976, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 19572 + }, + { + "epoch": 0.19573, + "grad_norm": 0.8191030608931872, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 19573 + }, + { + "epoch": 0.19574, + "grad_norm": 0.8167295038611587, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 19574 + }, + { + "epoch": 0.19575, + "grad_norm": 0.8155217987813517, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 19575 + }, + { + "epoch": 0.19576, + "grad_norm": 0.7620612768135239, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 19576 + }, + { + "epoch": 0.19577, + "grad_norm": 0.7508875732334481, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 19577 + }, + { + "epoch": 0.19578, + "grad_norm": 0.6591786381246394, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 19578 + }, + { + "epoch": 0.19579, + "grad_norm": 0.6665442541428873, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 19579 + }, + { + "epoch": 0.1958, + "grad_norm": 0.7184883388085611, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 19580 + }, + { + "epoch": 0.19581, + "grad_norm": 0.6716101243677017, + "learning_rate": 0.003, + "loss": 4.005, + "step": 19581 + }, + { + "epoch": 0.19582, + "grad_norm": 0.5931393714437542, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 19582 + }, + { + "epoch": 0.19583, + "grad_norm": 0.5237593632232761, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 19583 + }, + { + "epoch": 0.19584, + "grad_norm": 0.5447536023317237, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 19584 + }, + { + "epoch": 0.19585, + "grad_norm": 0.5514066401220156, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 19585 + }, + { + "epoch": 0.19586, + "grad_norm": 0.5876951881811275, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 19586 + }, + { + "epoch": 0.19587, + "grad_norm": 0.7803656706981374, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 19587 + }, + { + "epoch": 0.19588, + "grad_norm": 1.0531089546767054, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 19588 + }, + { + "epoch": 0.19589, + "grad_norm": 1.2906770227877205, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 19589 + }, + { + "epoch": 0.1959, + "grad_norm": 0.640725072928441, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 19590 + }, + { + "epoch": 0.19591, + "grad_norm": 0.765682506484728, + "learning_rate": 0.003, + "loss": 4.073, + "step": 19591 + }, + { + "epoch": 0.19592, + "grad_norm": 0.8051515711380889, + "learning_rate": 0.003, + "loss": 4.066, + "step": 19592 + }, + { + "epoch": 0.19593, + "grad_norm": 0.8090570490624589, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 19593 + }, + { + "epoch": 0.19594, + "grad_norm": 0.9148053656372326, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 19594 + }, + { + "epoch": 0.19595, + "grad_norm": 0.9588129993589732, + "learning_rate": 0.003, + "loss": 4.047, + "step": 19595 + }, + { + "epoch": 0.19596, + "grad_norm": 1.0845422131169817, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 19596 + }, + { + "epoch": 0.19597, + "grad_norm": 1.2405161970936593, + "learning_rate": 0.003, + "loss": 4.087, + "step": 19597 + }, + { + "epoch": 0.19598, + "grad_norm": 0.8763254362895997, + "learning_rate": 0.003, + "loss": 4.039, + "step": 19598 + }, + { + "epoch": 0.19599, + "grad_norm": 0.8397382645251689, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 19599 + }, + { + "epoch": 0.196, + "grad_norm": 0.8324375316149644, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 19600 + }, + { + "epoch": 0.19601, + "grad_norm": 0.8671578177626034, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 19601 + }, + { + "epoch": 0.19602, + "grad_norm": 0.8984845395036228, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 19602 + }, + { + "epoch": 0.19603, + "grad_norm": 0.9138350481391048, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 19603 + }, + { + "epoch": 0.19604, + "grad_norm": 0.8679316485996372, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 19604 + }, + { + "epoch": 0.19605, + "grad_norm": 1.0142894792470354, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 19605 + }, + { + "epoch": 0.19606, + "grad_norm": 1.1463352220278804, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 19606 + }, + { + "epoch": 0.19607, + "grad_norm": 0.9032755723109397, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 19607 + }, + { + "epoch": 0.19608, + "grad_norm": 0.9756126186131024, + "learning_rate": 0.003, + "loss": 4.054, + "step": 19608 + }, + { + "epoch": 0.19609, + "grad_norm": 1.0252085137602285, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 19609 + }, + { + "epoch": 0.1961, + "grad_norm": 1.0275481471237282, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 19610 + }, + { + "epoch": 0.19611, + "grad_norm": 1.1091289142283627, + "learning_rate": 0.003, + "loss": 4.072, + "step": 19611 + }, + { + "epoch": 0.19612, + "grad_norm": 1.0041349989429535, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 19612 + }, + { + "epoch": 0.19613, + "grad_norm": 1.0752228600198706, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 19613 + }, + { + "epoch": 0.19614, + "grad_norm": 0.7712905298214182, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 19614 + }, + { + "epoch": 0.19615, + "grad_norm": 0.75575047929902, + "learning_rate": 0.003, + "loss": 4.049, + "step": 19615 + }, + { + "epoch": 0.19616, + "grad_norm": 0.8029741166981483, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 19616 + }, + { + "epoch": 0.19617, + "grad_norm": 0.8626511772695846, + "learning_rate": 0.003, + "loss": 4.088, + "step": 19617 + }, + { + "epoch": 0.19618, + "grad_norm": 0.8672605456043144, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 19618 + }, + { + "epoch": 0.19619, + "grad_norm": 0.906518472965422, + "learning_rate": 0.003, + "loss": 4.067, + "step": 19619 + }, + { + "epoch": 0.1962, + "grad_norm": 0.978468408039395, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 19620 + }, + { + "epoch": 0.19621, + "grad_norm": 1.14111150112848, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 19621 + }, + { + "epoch": 0.19622, + "grad_norm": 0.8650014457284066, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 19622 + }, + { + "epoch": 0.19623, + "grad_norm": 0.9636491762678555, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 19623 + }, + { + "epoch": 0.19624, + "grad_norm": 1.0588919339234388, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 19624 + }, + { + "epoch": 0.19625, + "grad_norm": 0.863972905355138, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 19625 + }, + { + "epoch": 0.19626, + "grad_norm": 0.834111566098492, + "learning_rate": 0.003, + "loss": 4.077, + "step": 19626 + }, + { + "epoch": 0.19627, + "grad_norm": 0.7487609764255219, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 19627 + }, + { + "epoch": 0.19628, + "grad_norm": 0.6943484442750629, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 19628 + }, + { + "epoch": 0.19629, + "grad_norm": 0.70303815508834, + "learning_rate": 0.003, + "loss": 4.044, + "step": 19629 + }, + { + "epoch": 0.1963, + "grad_norm": 0.7445017089096984, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 19630 + }, + { + "epoch": 0.19631, + "grad_norm": 0.8521821108803652, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 19631 + }, + { + "epoch": 0.19632, + "grad_norm": 1.0450800327161265, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 19632 + }, + { + "epoch": 0.19633, + "grad_norm": 1.3012511420639266, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 19633 + }, + { + "epoch": 0.19634, + "grad_norm": 0.6218729762126922, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 19634 + }, + { + "epoch": 0.19635, + "grad_norm": 0.775632845745399, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 19635 + }, + { + "epoch": 0.19636, + "grad_norm": 0.8638470958774185, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 19636 + }, + { + "epoch": 0.19637, + "grad_norm": 0.9111695509287965, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 19637 + }, + { + "epoch": 0.19638, + "grad_norm": 0.9149375831134107, + "learning_rate": 0.003, + "loss": 4.0031, + "step": 19638 + }, + { + "epoch": 0.19639, + "grad_norm": 0.9230680816971693, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 19639 + }, + { + "epoch": 0.1964, + "grad_norm": 0.9303667561199962, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 19640 + }, + { + "epoch": 0.19641, + "grad_norm": 0.9743412657753244, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 19641 + }, + { + "epoch": 0.19642, + "grad_norm": 1.0421841288257094, + "learning_rate": 0.003, + "loss": 4.053, + "step": 19642 + }, + { + "epoch": 0.19643, + "grad_norm": 1.0021674751672769, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 19643 + }, + { + "epoch": 0.19644, + "grad_norm": 1.0675909299862363, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 19644 + }, + { + "epoch": 0.19645, + "grad_norm": 1.0107519134522043, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 19645 + }, + { + "epoch": 0.19646, + "grad_norm": 1.1025867901172475, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 19646 + }, + { + "epoch": 0.19647, + "grad_norm": 0.8284739080966057, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 19647 + }, + { + "epoch": 0.19648, + "grad_norm": 0.7270951908958995, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 19648 + }, + { + "epoch": 0.19649, + "grad_norm": 0.7710490808011464, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 19649 + }, + { + "epoch": 0.1965, + "grad_norm": 0.8848802236200047, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 19650 + }, + { + "epoch": 0.19651, + "grad_norm": 1.0774926919785093, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 19651 + }, + { + "epoch": 0.19652, + "grad_norm": 1.1362621198906069, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 19652 + }, + { + "epoch": 0.19653, + "grad_norm": 0.9247064662537829, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 19653 + }, + { + "epoch": 0.19654, + "grad_norm": 0.8936436484304016, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 19654 + }, + { + "epoch": 0.19655, + "grad_norm": 0.98716827264183, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 19655 + }, + { + "epoch": 0.19656, + "grad_norm": 1.0106779134206885, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 19656 + }, + { + "epoch": 0.19657, + "grad_norm": 0.8389634395452044, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 19657 + }, + { + "epoch": 0.19658, + "grad_norm": 0.8126126389489013, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 19658 + }, + { + "epoch": 0.19659, + "grad_norm": 0.7590071171626337, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 19659 + }, + { + "epoch": 0.1966, + "grad_norm": 0.7992150930481975, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 19660 + }, + { + "epoch": 0.19661, + "grad_norm": 0.9637472510188058, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 19661 + }, + { + "epoch": 0.19662, + "grad_norm": 1.111990045897436, + "learning_rate": 0.003, + "loss": 4.087, + "step": 19662 + }, + { + "epoch": 0.19663, + "grad_norm": 0.9616634759268835, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 19663 + }, + { + "epoch": 0.19664, + "grad_norm": 1.0109756472601052, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 19664 + }, + { + "epoch": 0.19665, + "grad_norm": 0.9784657538349492, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 19665 + }, + { + "epoch": 0.19666, + "grad_norm": 1.0233782450256468, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 19666 + }, + { + "epoch": 0.19667, + "grad_norm": 1.0384438121030655, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 19667 + }, + { + "epoch": 0.19668, + "grad_norm": 1.0661705947840365, + "learning_rate": 0.003, + "loss": 4.054, + "step": 19668 + }, + { + "epoch": 0.19669, + "grad_norm": 0.9395447683901564, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 19669 + }, + { + "epoch": 0.1967, + "grad_norm": 0.9558765034267854, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 19670 + }, + { + "epoch": 0.19671, + "grad_norm": 1.1608338068984734, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 19671 + }, + { + "epoch": 0.19672, + "grad_norm": 0.751740136424847, + "learning_rate": 0.003, + "loss": 4.034, + "step": 19672 + }, + { + "epoch": 0.19673, + "grad_norm": 0.6821824535053846, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 19673 + }, + { + "epoch": 0.19674, + "grad_norm": 0.6404662033617751, + "learning_rate": 0.003, + "loss": 4.032, + "step": 19674 + }, + { + "epoch": 0.19675, + "grad_norm": 0.7087708448421819, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 19675 + }, + { + "epoch": 0.19676, + "grad_norm": 0.9362332098376248, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 19676 + }, + { + "epoch": 0.19677, + "grad_norm": 1.1325899664530585, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 19677 + }, + { + "epoch": 0.19678, + "grad_norm": 0.9202287386628409, + "learning_rate": 0.003, + "loss": 4.067, + "step": 19678 + }, + { + "epoch": 0.19679, + "grad_norm": 0.8831079853084167, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 19679 + }, + { + "epoch": 0.1968, + "grad_norm": 0.8468692919029652, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 19680 + }, + { + "epoch": 0.19681, + "grad_norm": 0.7368182715685371, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 19681 + }, + { + "epoch": 0.19682, + "grad_norm": 0.7474720761224553, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 19682 + }, + { + "epoch": 0.19683, + "grad_norm": 0.7968755783528199, + "learning_rate": 0.003, + "loss": 4.059, + "step": 19683 + }, + { + "epoch": 0.19684, + "grad_norm": 0.8302497756578229, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 19684 + }, + { + "epoch": 0.19685, + "grad_norm": 0.8083807470510587, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 19685 + }, + { + "epoch": 0.19686, + "grad_norm": 0.6840671889461788, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 19686 + }, + { + "epoch": 0.19687, + "grad_norm": 0.6379636062580218, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 19687 + }, + { + "epoch": 0.19688, + "grad_norm": 0.6755767538822668, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 19688 + }, + { + "epoch": 0.19689, + "grad_norm": 0.6155383828889599, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 19689 + }, + { + "epoch": 0.1969, + "grad_norm": 0.611227770767368, + "learning_rate": 0.003, + "loss": 4.072, + "step": 19690 + }, + { + "epoch": 0.19691, + "grad_norm": 0.8032269452447673, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 19691 + }, + { + "epoch": 0.19692, + "grad_norm": 1.198196493639563, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 19692 + }, + { + "epoch": 0.19693, + "grad_norm": 1.1955061577906123, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 19693 + }, + { + "epoch": 0.19694, + "grad_norm": 0.8579327325889964, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 19694 + }, + { + "epoch": 0.19695, + "grad_norm": 0.7537554169929473, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 19695 + }, + { + "epoch": 0.19696, + "grad_norm": 0.7589037687157838, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 19696 + }, + { + "epoch": 0.19697, + "grad_norm": 0.810450154172506, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 19697 + }, + { + "epoch": 0.19698, + "grad_norm": 0.7526534961346523, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 19698 + }, + { + "epoch": 0.19699, + "grad_norm": 0.6561645834753397, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 19699 + }, + { + "epoch": 0.197, + "grad_norm": 0.8700640046075444, + "learning_rate": 0.003, + "loss": 4.059, + "step": 19700 + }, + { + "epoch": 0.19701, + "grad_norm": 1.258262587953789, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 19701 + }, + { + "epoch": 0.19702, + "grad_norm": 1.0040667850909977, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 19702 + }, + { + "epoch": 0.19703, + "grad_norm": 0.7556615313818323, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 19703 + }, + { + "epoch": 0.19704, + "grad_norm": 0.5976184252835506, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 19704 + }, + { + "epoch": 0.19705, + "grad_norm": 0.650572677181974, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 19705 + }, + { + "epoch": 0.19706, + "grad_norm": 0.7443194742393572, + "learning_rate": 0.003, + "loss": 4.014, + "step": 19706 + }, + { + "epoch": 0.19707, + "grad_norm": 0.9616357129827846, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 19707 + }, + { + "epoch": 0.19708, + "grad_norm": 1.1756003143638607, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 19708 + }, + { + "epoch": 0.19709, + "grad_norm": 0.8357305268406774, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 19709 + }, + { + "epoch": 0.1971, + "grad_norm": 0.824451659941323, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 19710 + }, + { + "epoch": 0.19711, + "grad_norm": 0.7773752404696846, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 19711 + }, + { + "epoch": 0.19712, + "grad_norm": 0.8012507574969546, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 19712 + }, + { + "epoch": 0.19713, + "grad_norm": 0.9489867745855541, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 19713 + }, + { + "epoch": 0.19714, + "grad_norm": 1.2844531029883166, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 19714 + }, + { + "epoch": 0.19715, + "grad_norm": 0.8908589160181729, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 19715 + }, + { + "epoch": 0.19716, + "grad_norm": 0.9580833767775246, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 19716 + }, + { + "epoch": 0.19717, + "grad_norm": 1.1680224981426544, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 19717 + }, + { + "epoch": 0.19718, + "grad_norm": 0.9140874492684244, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 19718 + }, + { + "epoch": 0.19719, + "grad_norm": 0.919575110954966, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 19719 + }, + { + "epoch": 0.1972, + "grad_norm": 1.0296479937908731, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 19720 + }, + { + "epoch": 0.19721, + "grad_norm": 1.073998288315665, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 19721 + }, + { + "epoch": 0.19722, + "grad_norm": 0.9543246554641899, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 19722 + }, + { + "epoch": 0.19723, + "grad_norm": 1.028807164252435, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 19723 + }, + { + "epoch": 0.19724, + "grad_norm": 1.150989933268326, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 19724 + }, + { + "epoch": 0.19725, + "grad_norm": 0.8941327796303788, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 19725 + }, + { + "epoch": 0.19726, + "grad_norm": 0.8978539867956097, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 19726 + }, + { + "epoch": 0.19727, + "grad_norm": 0.9735069665033468, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 19727 + }, + { + "epoch": 0.19728, + "grad_norm": 1.0393834033535847, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 19728 + }, + { + "epoch": 0.19729, + "grad_norm": 1.0101489645821173, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 19729 + }, + { + "epoch": 0.1973, + "grad_norm": 0.8446477851013826, + "learning_rate": 0.003, + "loss": 4.074, + "step": 19730 + }, + { + "epoch": 0.19731, + "grad_norm": 0.8508381578061052, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 19731 + }, + { + "epoch": 0.19732, + "grad_norm": 0.8852094200145928, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 19732 + }, + { + "epoch": 0.19733, + "grad_norm": 0.8712907379011499, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 19733 + }, + { + "epoch": 0.19734, + "grad_norm": 0.9957658734056823, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 19734 + }, + { + "epoch": 0.19735, + "grad_norm": 1.1041746410416118, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 19735 + }, + { + "epoch": 0.19736, + "grad_norm": 1.0030570397593415, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 19736 + }, + { + "epoch": 0.19737, + "grad_norm": 1.0849763446626417, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 19737 + }, + { + "epoch": 0.19738, + "grad_norm": 1.0089892081159848, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 19738 + }, + { + "epoch": 0.19739, + "grad_norm": 0.9486442335835147, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 19739 + }, + { + "epoch": 0.1974, + "grad_norm": 0.9928907586099743, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 19740 + }, + { + "epoch": 0.19741, + "grad_norm": 0.9317527123033049, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 19741 + }, + { + "epoch": 0.19742, + "grad_norm": 0.9604994076147799, + "learning_rate": 0.003, + "loss": 4.058, + "step": 19742 + }, + { + "epoch": 0.19743, + "grad_norm": 0.9723372857862396, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 19743 + }, + { + "epoch": 0.19744, + "grad_norm": 1.003076679951651, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 19744 + }, + { + "epoch": 0.19745, + "grad_norm": 0.9745238325892046, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 19745 + }, + { + "epoch": 0.19746, + "grad_norm": 0.952883463657387, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 19746 + }, + { + "epoch": 0.19747, + "grad_norm": 0.7611905659456139, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 19747 + }, + { + "epoch": 0.19748, + "grad_norm": 0.6188209555231131, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 19748 + }, + { + "epoch": 0.19749, + "grad_norm": 0.7040836631450551, + "learning_rate": 0.003, + "loss": 4.058, + "step": 19749 + }, + { + "epoch": 0.1975, + "grad_norm": 0.6293627760012084, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 19750 + }, + { + "epoch": 0.19751, + "grad_norm": 0.5887160146363231, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 19751 + }, + { + "epoch": 0.19752, + "grad_norm": 0.559532344798392, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 19752 + }, + { + "epoch": 0.19753, + "grad_norm": 0.5352111996411758, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 19753 + }, + { + "epoch": 0.19754, + "grad_norm": 0.6066195478172508, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 19754 + }, + { + "epoch": 0.19755, + "grad_norm": 0.6629454218326679, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 19755 + }, + { + "epoch": 0.19756, + "grad_norm": 0.781823368244423, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 19756 + }, + { + "epoch": 0.19757, + "grad_norm": 0.9193894567332758, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 19757 + }, + { + "epoch": 0.19758, + "grad_norm": 1.1038639820589233, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 19758 + }, + { + "epoch": 0.19759, + "grad_norm": 0.9780960388395343, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 19759 + }, + { + "epoch": 0.1976, + "grad_norm": 0.9927967189575263, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 19760 + }, + { + "epoch": 0.19761, + "grad_norm": 1.1180090639873808, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 19761 + }, + { + "epoch": 0.19762, + "grad_norm": 0.9732835515636639, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 19762 + }, + { + "epoch": 0.19763, + "grad_norm": 0.9175308228768257, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 19763 + }, + { + "epoch": 0.19764, + "grad_norm": 0.8394048070192874, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 19764 + }, + { + "epoch": 0.19765, + "grad_norm": 0.9225166966022803, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 19765 + }, + { + "epoch": 0.19766, + "grad_norm": 1.009090741829306, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 19766 + }, + { + "epoch": 0.19767, + "grad_norm": 1.2512525657798192, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 19767 + }, + { + "epoch": 0.19768, + "grad_norm": 1.0024586200818144, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 19768 + }, + { + "epoch": 0.19769, + "grad_norm": 0.9202985172242644, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 19769 + }, + { + "epoch": 0.1977, + "grad_norm": 0.8682879533663649, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 19770 + }, + { + "epoch": 0.19771, + "grad_norm": 0.8300298620182743, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 19771 + }, + { + "epoch": 0.19772, + "grad_norm": 0.8936461475002073, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 19772 + }, + { + "epoch": 0.19773, + "grad_norm": 0.9616423495584655, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 19773 + }, + { + "epoch": 0.19774, + "grad_norm": 0.8663505610530289, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 19774 + }, + { + "epoch": 0.19775, + "grad_norm": 0.88791514089382, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 19775 + }, + { + "epoch": 0.19776, + "grad_norm": 0.9869297135387154, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 19776 + }, + { + "epoch": 0.19777, + "grad_norm": 0.9167075446875883, + "learning_rate": 0.003, + "loss": 4.038, + "step": 19777 + }, + { + "epoch": 0.19778, + "grad_norm": 0.7190250293344528, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 19778 + }, + { + "epoch": 0.19779, + "grad_norm": 0.6441673767575137, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 19779 + }, + { + "epoch": 0.1978, + "grad_norm": 0.643983411523129, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 19780 + }, + { + "epoch": 0.19781, + "grad_norm": 0.7175178029511101, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 19781 + }, + { + "epoch": 0.19782, + "grad_norm": 0.8329549253153612, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 19782 + }, + { + "epoch": 0.19783, + "grad_norm": 0.9424467432848166, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 19783 + }, + { + "epoch": 0.19784, + "grad_norm": 1.1536430734054381, + "learning_rate": 0.003, + "loss": 4.074, + "step": 19784 + }, + { + "epoch": 0.19785, + "grad_norm": 0.8073132228902993, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 19785 + }, + { + "epoch": 0.19786, + "grad_norm": 0.8199887374947842, + "learning_rate": 0.003, + "loss": 4.055, + "step": 19786 + }, + { + "epoch": 0.19787, + "grad_norm": 0.8804475781197203, + "learning_rate": 0.003, + "loss": 4.055, + "step": 19787 + }, + { + "epoch": 0.19788, + "grad_norm": 1.031976531151439, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 19788 + }, + { + "epoch": 0.19789, + "grad_norm": 1.287596305378352, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 19789 + }, + { + "epoch": 0.1979, + "grad_norm": 0.6316533332578996, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 19790 + }, + { + "epoch": 0.19791, + "grad_norm": 0.7994082318587541, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 19791 + }, + { + "epoch": 0.19792, + "grad_norm": 0.9152851017072172, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 19792 + }, + { + "epoch": 0.19793, + "grad_norm": 0.9140567040182891, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 19793 + }, + { + "epoch": 0.19794, + "grad_norm": 0.8925892623813582, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 19794 + }, + { + "epoch": 0.19795, + "grad_norm": 0.9151546478166942, + "learning_rate": 0.003, + "loss": 4.065, + "step": 19795 + }, + { + "epoch": 0.19796, + "grad_norm": 1.046776094430734, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 19796 + }, + { + "epoch": 0.19797, + "grad_norm": 1.0861390961888773, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 19797 + }, + { + "epoch": 0.19798, + "grad_norm": 1.0465579906692883, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 19798 + }, + { + "epoch": 0.19799, + "grad_norm": 1.0146469760986077, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 19799 + }, + { + "epoch": 0.198, + "grad_norm": 0.9964749861423164, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 19800 + }, + { + "epoch": 0.19801, + "grad_norm": 0.9823527901657623, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 19801 + }, + { + "epoch": 0.19802, + "grad_norm": 0.9879118978824735, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 19802 + }, + { + "epoch": 0.19803, + "grad_norm": 1.0018465737365319, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 19803 + }, + { + "epoch": 0.19804, + "grad_norm": 1.0920266169684745, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 19804 + }, + { + "epoch": 0.19805, + "grad_norm": 0.9443044636960449, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 19805 + }, + { + "epoch": 0.19806, + "grad_norm": 0.9999826116147256, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 19806 + }, + { + "epoch": 0.19807, + "grad_norm": 1.0618088952745237, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 19807 + }, + { + "epoch": 0.19808, + "grad_norm": 1.001497051288458, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 19808 + }, + { + "epoch": 0.19809, + "grad_norm": 1.0528528445282366, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 19809 + }, + { + "epoch": 0.1981, + "grad_norm": 0.8955237049017132, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 19810 + }, + { + "epoch": 0.19811, + "grad_norm": 0.7835393392828128, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 19811 + }, + { + "epoch": 0.19812, + "grad_norm": 0.8973380312443668, + "learning_rate": 0.003, + "loss": 4.043, + "step": 19812 + }, + { + "epoch": 0.19813, + "grad_norm": 1.081027770683145, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 19813 + }, + { + "epoch": 0.19814, + "grad_norm": 1.141195561051589, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 19814 + }, + { + "epoch": 0.19815, + "grad_norm": 0.844089372192303, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 19815 + }, + { + "epoch": 0.19816, + "grad_norm": 0.7989452218297074, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 19816 + }, + { + "epoch": 0.19817, + "grad_norm": 0.8138069963141369, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 19817 + }, + { + "epoch": 0.19818, + "grad_norm": 0.8574226539254926, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 19818 + }, + { + "epoch": 0.19819, + "grad_norm": 0.7977803039052026, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 19819 + }, + { + "epoch": 0.1982, + "grad_norm": 0.752007945666223, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 19820 + }, + { + "epoch": 0.19821, + "grad_norm": 0.8180053670746039, + "learning_rate": 0.003, + "loss": 4.063, + "step": 19821 + }, + { + "epoch": 0.19822, + "grad_norm": 0.9991391048961866, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 19822 + }, + { + "epoch": 0.19823, + "grad_norm": 1.1112560788388506, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 19823 + }, + { + "epoch": 0.19824, + "grad_norm": 1.0015471589835643, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 19824 + }, + { + "epoch": 0.19825, + "grad_norm": 0.8825345688164498, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 19825 + }, + { + "epoch": 0.19826, + "grad_norm": 0.6901532947233038, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 19826 + }, + { + "epoch": 0.19827, + "grad_norm": 0.6033120091223885, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 19827 + }, + { + "epoch": 0.19828, + "grad_norm": 0.6079825563637126, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 19828 + }, + { + "epoch": 0.19829, + "grad_norm": 0.6561029843983504, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 19829 + }, + { + "epoch": 0.1983, + "grad_norm": 0.7168356167290718, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 19830 + }, + { + "epoch": 0.19831, + "grad_norm": 0.8027757272656064, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 19831 + }, + { + "epoch": 0.19832, + "grad_norm": 0.7531498764422621, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 19832 + }, + { + "epoch": 0.19833, + "grad_norm": 0.5724504129836073, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 19833 + }, + { + "epoch": 0.19834, + "grad_norm": 0.5417026494312349, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 19834 + }, + { + "epoch": 0.19835, + "grad_norm": 0.6664803251147037, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 19835 + }, + { + "epoch": 0.19836, + "grad_norm": 0.8605774550359709, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 19836 + }, + { + "epoch": 0.19837, + "grad_norm": 0.9613068224672485, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 19837 + }, + { + "epoch": 0.19838, + "grad_norm": 1.034405634544546, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 19838 + }, + { + "epoch": 0.19839, + "grad_norm": 1.218292370650084, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 19839 + }, + { + "epoch": 0.1984, + "grad_norm": 0.7228430867683359, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 19840 + }, + { + "epoch": 0.19841, + "grad_norm": 0.7155503591441723, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 19841 + }, + { + "epoch": 0.19842, + "grad_norm": 0.7949137251277599, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 19842 + }, + { + "epoch": 0.19843, + "grad_norm": 1.0480441969647263, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 19843 + }, + { + "epoch": 0.19844, + "grad_norm": 1.1111687513827093, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 19844 + }, + { + "epoch": 0.19845, + "grad_norm": 0.8451794233928357, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 19845 + }, + { + "epoch": 0.19846, + "grad_norm": 0.8571616278383336, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 19846 + }, + { + "epoch": 0.19847, + "grad_norm": 0.8407128714816388, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 19847 + }, + { + "epoch": 0.19848, + "grad_norm": 1.0110269823089972, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 19848 + }, + { + "epoch": 0.19849, + "grad_norm": 1.2944775759063354, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 19849 + }, + { + "epoch": 0.1985, + "grad_norm": 0.7967670539514138, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 19850 + }, + { + "epoch": 0.19851, + "grad_norm": 0.8151984717720282, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 19851 + }, + { + "epoch": 0.19852, + "grad_norm": 0.9428974553924526, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 19852 + }, + { + "epoch": 0.19853, + "grad_norm": 1.0652837128197106, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 19853 + }, + { + "epoch": 0.19854, + "grad_norm": 0.9955810564174555, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 19854 + }, + { + "epoch": 0.19855, + "grad_norm": 1.0103477292459686, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 19855 + }, + { + "epoch": 0.19856, + "grad_norm": 0.9197432984947649, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 19856 + }, + { + "epoch": 0.19857, + "grad_norm": 0.9606376147101987, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 19857 + }, + { + "epoch": 0.19858, + "grad_norm": 1.0384020553475217, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 19858 + }, + { + "epoch": 0.19859, + "grad_norm": 1.0988358737331458, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 19859 + }, + { + "epoch": 0.1986, + "grad_norm": 1.3451903501613105, + "learning_rate": 0.003, + "loss": 4.079, + "step": 19860 + }, + { + "epoch": 0.19861, + "grad_norm": 0.8881360761180166, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 19861 + }, + { + "epoch": 0.19862, + "grad_norm": 0.8660717523920124, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 19862 + }, + { + "epoch": 0.19863, + "grad_norm": 0.9976570646768376, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 19863 + }, + { + "epoch": 0.19864, + "grad_norm": 0.9848723273395584, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 19864 + }, + { + "epoch": 0.19865, + "grad_norm": 0.8934798896098098, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 19865 + }, + { + "epoch": 0.19866, + "grad_norm": 0.8913654835852564, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 19866 + }, + { + "epoch": 0.19867, + "grad_norm": 0.9182084049802524, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 19867 + }, + { + "epoch": 0.19868, + "grad_norm": 0.9657264721270611, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 19868 + }, + { + "epoch": 0.19869, + "grad_norm": 0.957293500686211, + "learning_rate": 0.003, + "loss": 4.076, + "step": 19869 + }, + { + "epoch": 0.1987, + "grad_norm": 1.0365374160034644, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 19870 + }, + { + "epoch": 0.19871, + "grad_norm": 1.179155935998019, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 19871 + }, + { + "epoch": 0.19872, + "grad_norm": 0.9142861225462765, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 19872 + }, + { + "epoch": 0.19873, + "grad_norm": 0.7806002618662166, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 19873 + }, + { + "epoch": 0.19874, + "grad_norm": 0.6421677922073148, + "learning_rate": 0.003, + "loss": 4.066, + "step": 19874 + }, + { + "epoch": 0.19875, + "grad_norm": 0.5749383292252597, + "learning_rate": 0.003, + "loss": 4.042, + "step": 19875 + }, + { + "epoch": 0.19876, + "grad_norm": 0.6521724037453608, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 19876 + }, + { + "epoch": 0.19877, + "grad_norm": 0.8131557022513219, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 19877 + }, + { + "epoch": 0.19878, + "grad_norm": 0.8139523164329852, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 19878 + }, + { + "epoch": 0.19879, + "grad_norm": 0.8487393099543807, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 19879 + }, + { + "epoch": 0.1988, + "grad_norm": 0.8453183984430512, + "learning_rate": 0.003, + "loss": 4.055, + "step": 19880 + }, + { + "epoch": 0.19881, + "grad_norm": 0.8421102454998768, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 19881 + }, + { + "epoch": 0.19882, + "grad_norm": 0.71154179744596, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 19882 + }, + { + "epoch": 0.19883, + "grad_norm": 0.8159281335825941, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 19883 + }, + { + "epoch": 0.19884, + "grad_norm": 1.0723499246928387, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 19884 + }, + { + "epoch": 0.19885, + "grad_norm": 1.2251720973359979, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 19885 + }, + { + "epoch": 0.19886, + "grad_norm": 0.7352708327150798, + "learning_rate": 0.003, + "loss": 4.037, + "step": 19886 + }, + { + "epoch": 0.19887, + "grad_norm": 0.6677081852101372, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 19887 + }, + { + "epoch": 0.19888, + "grad_norm": 0.8420646776254377, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 19888 + }, + { + "epoch": 0.19889, + "grad_norm": 1.0181240476962217, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 19889 + }, + { + "epoch": 0.1989, + "grad_norm": 1.0490571071709465, + "learning_rate": 0.003, + "loss": 4.076, + "step": 19890 + }, + { + "epoch": 0.19891, + "grad_norm": 0.8728379339581817, + "learning_rate": 0.003, + "loss": 4.057, + "step": 19891 + }, + { + "epoch": 0.19892, + "grad_norm": 0.8813000974001235, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 19892 + }, + { + "epoch": 0.19893, + "grad_norm": 0.7264968031599618, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 19893 + }, + { + "epoch": 0.19894, + "grad_norm": 0.7250654055700293, + "learning_rate": 0.003, + "loss": 4.05, + "step": 19894 + }, + { + "epoch": 0.19895, + "grad_norm": 0.7702901389350747, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 19895 + }, + { + "epoch": 0.19896, + "grad_norm": 0.8909511427953801, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 19896 + }, + { + "epoch": 0.19897, + "grad_norm": 0.9173829257595774, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 19897 + }, + { + "epoch": 0.19898, + "grad_norm": 0.8842631804974519, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 19898 + }, + { + "epoch": 0.19899, + "grad_norm": 0.7923682624901234, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 19899 + }, + { + "epoch": 0.199, + "grad_norm": 0.7539960971375723, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 19900 + }, + { + "epoch": 0.19901, + "grad_norm": 0.703005987116502, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 19901 + }, + { + "epoch": 0.19902, + "grad_norm": 0.8009496531249488, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 19902 + }, + { + "epoch": 0.19903, + "grad_norm": 0.9067062801110316, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 19903 + }, + { + "epoch": 0.19904, + "grad_norm": 1.1786764927253293, + "learning_rate": 0.003, + "loss": 4.066, + "step": 19904 + }, + { + "epoch": 0.19905, + "grad_norm": 1.1361924670089292, + "learning_rate": 0.003, + "loss": 4.067, + "step": 19905 + }, + { + "epoch": 0.19906, + "grad_norm": 1.0529521477198889, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 19906 + }, + { + "epoch": 0.19907, + "grad_norm": 1.0046040807412195, + "learning_rate": 0.003, + "loss": 4.071, + "step": 19907 + }, + { + "epoch": 0.19908, + "grad_norm": 1.0326223487037012, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 19908 + }, + { + "epoch": 0.19909, + "grad_norm": 0.8737179046502894, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 19909 + }, + { + "epoch": 0.1991, + "grad_norm": 0.890611080041377, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 19910 + }, + { + "epoch": 0.19911, + "grad_norm": 1.0069355533898776, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 19911 + }, + { + "epoch": 0.19912, + "grad_norm": 1.2056728238597598, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 19912 + }, + { + "epoch": 0.19913, + "grad_norm": 0.9177299198604985, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 19913 + }, + { + "epoch": 0.19914, + "grad_norm": 1.1338328387087382, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 19914 + }, + { + "epoch": 0.19915, + "grad_norm": 1.136958933495833, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 19915 + }, + { + "epoch": 0.19916, + "grad_norm": 0.8991149258341761, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 19916 + }, + { + "epoch": 0.19917, + "grad_norm": 0.8335027715715886, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 19917 + }, + { + "epoch": 0.19918, + "grad_norm": 0.9043135139359022, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 19918 + }, + { + "epoch": 0.19919, + "grad_norm": 0.9987739405236165, + "learning_rate": 0.003, + "loss": 4.064, + "step": 19919 + }, + { + "epoch": 0.1992, + "grad_norm": 0.9957258338930796, + "learning_rate": 0.003, + "loss": 4.082, + "step": 19920 + }, + { + "epoch": 0.19921, + "grad_norm": 0.8952638043759665, + "learning_rate": 0.003, + "loss": 4.076, + "step": 19921 + }, + { + "epoch": 0.19922, + "grad_norm": 0.8106292391350349, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 19922 + }, + { + "epoch": 0.19923, + "grad_norm": 0.7809987620556, + "learning_rate": 0.003, + "loss": 4.079, + "step": 19923 + }, + { + "epoch": 0.19924, + "grad_norm": 0.635369095839311, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 19924 + }, + { + "epoch": 0.19925, + "grad_norm": 0.585805181251373, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 19925 + }, + { + "epoch": 0.19926, + "grad_norm": 0.5323816818131343, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 19926 + }, + { + "epoch": 0.19927, + "grad_norm": 0.49650199814277274, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 19927 + }, + { + "epoch": 0.19928, + "grad_norm": 0.479325318995144, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 19928 + }, + { + "epoch": 0.19929, + "grad_norm": 0.5299288043658947, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 19929 + }, + { + "epoch": 0.1993, + "grad_norm": 0.6228564791277241, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 19930 + }, + { + "epoch": 0.19931, + "grad_norm": 0.8395177636113068, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 19931 + }, + { + "epoch": 0.19932, + "grad_norm": 1.2770943951623002, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 19932 + }, + { + "epoch": 0.19933, + "grad_norm": 0.7990604435104527, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 19933 + }, + { + "epoch": 0.19934, + "grad_norm": 0.7105284669586823, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 19934 + }, + { + "epoch": 0.19935, + "grad_norm": 0.861766218815007, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 19935 + }, + { + "epoch": 0.19936, + "grad_norm": 1.0003200038958466, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 19936 + }, + { + "epoch": 0.19937, + "grad_norm": 1.1483690283132395, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 19937 + }, + { + "epoch": 0.19938, + "grad_norm": 0.9464365396234878, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 19938 + }, + { + "epoch": 0.19939, + "grad_norm": 0.8942595361201251, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 19939 + }, + { + "epoch": 0.1994, + "grad_norm": 0.8395261997307861, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 19940 + }, + { + "epoch": 0.19941, + "grad_norm": 0.8439607063196037, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 19941 + }, + { + "epoch": 0.19942, + "grad_norm": 0.7949157567469377, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 19942 + }, + { + "epoch": 0.19943, + "grad_norm": 0.7837447744102928, + "learning_rate": 0.003, + "loss": 4.077, + "step": 19943 + }, + { + "epoch": 0.19944, + "grad_norm": 0.8196648127077458, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 19944 + }, + { + "epoch": 0.19945, + "grad_norm": 0.847908100014604, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 19945 + }, + { + "epoch": 0.19946, + "grad_norm": 0.7954309923061973, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 19946 + }, + { + "epoch": 0.19947, + "grad_norm": 0.8829725129318641, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 19947 + }, + { + "epoch": 0.19948, + "grad_norm": 1.154184820109059, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 19948 + }, + { + "epoch": 0.19949, + "grad_norm": 1.2474068658566835, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 19949 + }, + { + "epoch": 0.1995, + "grad_norm": 0.7853262660096543, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 19950 + }, + { + "epoch": 0.19951, + "grad_norm": 0.6203693143297675, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 19951 + }, + { + "epoch": 0.19952, + "grad_norm": 0.641965893367166, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 19952 + }, + { + "epoch": 0.19953, + "grad_norm": 0.7600255241031556, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 19953 + }, + { + "epoch": 0.19954, + "grad_norm": 0.9000486167165967, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 19954 + }, + { + "epoch": 0.19955, + "grad_norm": 1.0178304320588876, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 19955 + }, + { + "epoch": 0.19956, + "grad_norm": 1.0722741876495072, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 19956 + }, + { + "epoch": 0.19957, + "grad_norm": 0.8604334677789589, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 19957 + }, + { + "epoch": 0.19958, + "grad_norm": 0.725246645570351, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 19958 + }, + { + "epoch": 0.19959, + "grad_norm": 0.8575132762287329, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 19959 + }, + { + "epoch": 0.1996, + "grad_norm": 1.0150048930227236, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 19960 + }, + { + "epoch": 0.19961, + "grad_norm": 1.0465999661583107, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 19961 + }, + { + "epoch": 0.19962, + "grad_norm": 0.954332165206631, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 19962 + }, + { + "epoch": 0.19963, + "grad_norm": 0.9826237662598315, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 19963 + }, + { + "epoch": 0.19964, + "grad_norm": 1.0495364764278192, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 19964 + }, + { + "epoch": 0.19965, + "grad_norm": 0.9288997451281783, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 19965 + }, + { + "epoch": 0.19966, + "grad_norm": 0.924625144905568, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 19966 + }, + { + "epoch": 0.19967, + "grad_norm": 0.7039083317237521, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 19967 + }, + { + "epoch": 0.19968, + "grad_norm": 0.7667125873925056, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 19968 + }, + { + "epoch": 0.19969, + "grad_norm": 0.8032642139561513, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 19969 + }, + { + "epoch": 0.1997, + "grad_norm": 0.8634620299362555, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 19970 + }, + { + "epoch": 0.19971, + "grad_norm": 0.8711120377179361, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 19971 + }, + { + "epoch": 0.19972, + "grad_norm": 0.8219934537551538, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 19972 + }, + { + "epoch": 0.19973, + "grad_norm": 0.9277090498631383, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 19973 + }, + { + "epoch": 0.19974, + "grad_norm": 1.0010322521579693, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 19974 + }, + { + "epoch": 0.19975, + "grad_norm": 1.0754248434811629, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 19975 + }, + { + "epoch": 0.19976, + "grad_norm": 0.9996759123531492, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 19976 + }, + { + "epoch": 0.19977, + "grad_norm": 1.0835730332822093, + "learning_rate": 0.003, + "loss": 4.029, + "step": 19977 + }, + { + "epoch": 0.19978, + "grad_norm": 1.1182571439294586, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 19978 + }, + { + "epoch": 0.19979, + "grad_norm": 0.9474921357436006, + "learning_rate": 0.003, + "loss": 4.064, + "step": 19979 + }, + { + "epoch": 0.1998, + "grad_norm": 1.0688651065177377, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 19980 + }, + { + "epoch": 0.19981, + "grad_norm": 1.0509467419193252, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 19981 + }, + { + "epoch": 0.19982, + "grad_norm": 1.0361447387297629, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 19982 + }, + { + "epoch": 0.19983, + "grad_norm": 1.0118352507968307, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 19983 + }, + { + "epoch": 0.19984, + "grad_norm": 0.7224112297187693, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 19984 + }, + { + "epoch": 0.19985, + "grad_norm": 0.6682364334954591, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 19985 + }, + { + "epoch": 0.19986, + "grad_norm": 0.7095733709866191, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 19986 + }, + { + "epoch": 0.19987, + "grad_norm": 0.6131987616027459, + "learning_rate": 0.003, + "loss": 4.023, + "step": 19987 + }, + { + "epoch": 0.19988, + "grad_norm": 0.6676935999382909, + "learning_rate": 0.003, + "loss": 4.0007, + "step": 19988 + }, + { + "epoch": 0.19989, + "grad_norm": 0.7968219762064099, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 19989 + }, + { + "epoch": 0.1999, + "grad_norm": 0.9445274498411697, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 19990 + }, + { + "epoch": 0.19991, + "grad_norm": 1.0782156526501272, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 19991 + }, + { + "epoch": 0.19992, + "grad_norm": 0.8426083373781742, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 19992 + }, + { + "epoch": 0.19993, + "grad_norm": 0.8047996806530708, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 19993 + }, + { + "epoch": 0.19994, + "grad_norm": 0.856081906243602, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 19994 + }, + { + "epoch": 0.19995, + "grad_norm": 0.8403147233563304, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 19995 + }, + { + "epoch": 0.19996, + "grad_norm": 0.845467173435506, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 19996 + }, + { + "epoch": 0.19997, + "grad_norm": 0.6845837412725747, + "learning_rate": 0.003, + "loss": 4.031, + "step": 19997 + }, + { + "epoch": 0.19998, + "grad_norm": 0.6789030617477443, + "learning_rate": 0.003, + "loss": 4.065, + "step": 19998 + }, + { + "epoch": 0.19999, + "grad_norm": 0.5958043457351749, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 19999 + }, + { + "epoch": 0.2, + "grad_norm": 0.6026949561857949, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 20000 + }, + { + "epoch": 0.20001, + "grad_norm": 0.6327553979490099, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 20001 + }, + { + "epoch": 0.20002, + "grad_norm": 0.7678679790990631, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 20002 + }, + { + "epoch": 0.20003, + "grad_norm": 1.1010005031709966, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 20003 + }, + { + "epoch": 0.20004, + "grad_norm": 1.1403031258403544, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 20004 + }, + { + "epoch": 0.20005, + "grad_norm": 0.9623742236138284, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 20005 + }, + { + "epoch": 0.20006, + "grad_norm": 0.9806403495556745, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 20006 + }, + { + "epoch": 0.20007, + "grad_norm": 1.0725467394031813, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 20007 + }, + { + "epoch": 0.20008, + "grad_norm": 1.0881138286320984, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 20008 + }, + { + "epoch": 0.20009, + "grad_norm": 1.0718917931095369, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 20009 + }, + { + "epoch": 0.2001, + "grad_norm": 0.914771954112482, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 20010 + }, + { + "epoch": 0.20011, + "grad_norm": 0.9102671702500844, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 20011 + }, + { + "epoch": 0.20012, + "grad_norm": 0.985939701703711, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 20012 + }, + { + "epoch": 0.20013, + "grad_norm": 1.0456172407348348, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 20013 + }, + { + "epoch": 0.20014, + "grad_norm": 0.9093804870690748, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 20014 + }, + { + "epoch": 0.20015, + "grad_norm": 0.9709485483908908, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 20015 + }, + { + "epoch": 0.20016, + "grad_norm": 1.0131604480684195, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 20016 + }, + { + "epoch": 0.20017, + "grad_norm": 1.2173170340003923, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 20017 + }, + { + "epoch": 0.20018, + "grad_norm": 0.9647964027838721, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 20018 + }, + { + "epoch": 0.20019, + "grad_norm": 0.9754702722627663, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 20019 + }, + { + "epoch": 0.2002, + "grad_norm": 1.1954329106768795, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 20020 + }, + { + "epoch": 0.20021, + "grad_norm": 0.8274657349245227, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 20021 + }, + { + "epoch": 0.20022, + "grad_norm": 0.6823130708264915, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 20022 + }, + { + "epoch": 0.20023, + "grad_norm": 0.7110441144050427, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 20023 + }, + { + "epoch": 0.20024, + "grad_norm": 0.7601759184448256, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 20024 + }, + { + "epoch": 0.20025, + "grad_norm": 0.7828937579717817, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 20025 + }, + { + "epoch": 0.20026, + "grad_norm": 0.7425929457343321, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 20026 + }, + { + "epoch": 0.20027, + "grad_norm": 0.6487724460167308, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 20027 + }, + { + "epoch": 0.20028, + "grad_norm": 0.7666537863228957, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 20028 + }, + { + "epoch": 0.20029, + "grad_norm": 0.9761623573020568, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 20029 + }, + { + "epoch": 0.2003, + "grad_norm": 1.0205228779533229, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 20030 + }, + { + "epoch": 0.20031, + "grad_norm": 1.064011516280232, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 20031 + }, + { + "epoch": 0.20032, + "grad_norm": 1.070761784756568, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 20032 + }, + { + "epoch": 0.20033, + "grad_norm": 0.9109460012142986, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 20033 + }, + { + "epoch": 0.20034, + "grad_norm": 0.8399057251916916, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 20034 + }, + { + "epoch": 0.20035, + "grad_norm": 0.7273898263902544, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 20035 + }, + { + "epoch": 0.20036, + "grad_norm": 0.727788067922131, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 20036 + }, + { + "epoch": 0.20037, + "grad_norm": 0.7829675425223467, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 20037 + }, + { + "epoch": 0.20038, + "grad_norm": 0.7330935305324451, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 20038 + }, + { + "epoch": 0.20039, + "grad_norm": 0.7200919666213671, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 20039 + }, + { + "epoch": 0.2004, + "grad_norm": 0.8196275115286166, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 20040 + }, + { + "epoch": 0.20041, + "grad_norm": 0.928633667376276, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 20041 + }, + { + "epoch": 0.20042, + "grad_norm": 0.9762241536343469, + "learning_rate": 0.003, + "loss": 4.097, + "step": 20042 + }, + { + "epoch": 0.20043, + "grad_norm": 1.3046399047911965, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 20043 + }, + { + "epoch": 0.20044, + "grad_norm": 0.8898628750453046, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 20044 + }, + { + "epoch": 0.20045, + "grad_norm": 0.8665306581762685, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 20045 + }, + { + "epoch": 0.20046, + "grad_norm": 0.8626539112573827, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 20046 + }, + { + "epoch": 0.20047, + "grad_norm": 0.8990512212145599, + "learning_rate": 0.003, + "loss": 4.034, + "step": 20047 + }, + { + "epoch": 0.20048, + "grad_norm": 1.0182375116981142, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 20048 + }, + { + "epoch": 0.20049, + "grad_norm": 1.0085981594424702, + "learning_rate": 0.003, + "loss": 4.048, + "step": 20049 + }, + { + "epoch": 0.2005, + "grad_norm": 1.056161106794359, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 20050 + }, + { + "epoch": 0.20051, + "grad_norm": 0.980458659881766, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 20051 + }, + { + "epoch": 0.20052, + "grad_norm": 0.8637915953946603, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 20052 + }, + { + "epoch": 0.20053, + "grad_norm": 0.7982949488395766, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 20053 + }, + { + "epoch": 0.20054, + "grad_norm": 0.793427421107944, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 20054 + }, + { + "epoch": 0.20055, + "grad_norm": 0.8668356587530134, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 20055 + }, + { + "epoch": 0.20056, + "grad_norm": 0.8650457817405732, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 20056 + }, + { + "epoch": 0.20057, + "grad_norm": 0.9942552617357852, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 20057 + }, + { + "epoch": 0.20058, + "grad_norm": 1.0241433142808658, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 20058 + }, + { + "epoch": 0.20059, + "grad_norm": 1.0345981695309152, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 20059 + }, + { + "epoch": 0.2006, + "grad_norm": 0.977356780743453, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 20060 + }, + { + "epoch": 0.20061, + "grad_norm": 0.8544746775243427, + "learning_rate": 0.003, + "loss": 4.077, + "step": 20061 + }, + { + "epoch": 0.20062, + "grad_norm": 0.8178153506969954, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 20062 + }, + { + "epoch": 0.20063, + "grad_norm": 0.865809879265382, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 20063 + }, + { + "epoch": 0.20064, + "grad_norm": 0.832081819032671, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 20064 + }, + { + "epoch": 0.20065, + "grad_norm": 0.7994013015175561, + "learning_rate": 0.003, + "loss": 4.073, + "step": 20065 + }, + { + "epoch": 0.20066, + "grad_norm": 0.8227977247961066, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 20066 + }, + { + "epoch": 0.20067, + "grad_norm": 0.8007848694667724, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 20067 + }, + { + "epoch": 0.20068, + "grad_norm": 0.9951273072755855, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 20068 + }, + { + "epoch": 0.20069, + "grad_norm": 1.2166984111567118, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 20069 + }, + { + "epoch": 0.2007, + "grad_norm": 0.9194056298310658, + "learning_rate": 0.003, + "loss": 4.088, + "step": 20070 + }, + { + "epoch": 0.20071, + "grad_norm": 1.2245614817544845, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 20071 + }, + { + "epoch": 0.20072, + "grad_norm": 1.0794408169196437, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 20072 + }, + { + "epoch": 0.20073, + "grad_norm": 1.0177467952505859, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 20073 + }, + { + "epoch": 0.20074, + "grad_norm": 0.8641970977521174, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 20074 + }, + { + "epoch": 0.20075, + "grad_norm": 0.800882456548685, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 20075 + }, + { + "epoch": 0.20076, + "grad_norm": 0.8988311905748049, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 20076 + }, + { + "epoch": 0.20077, + "grad_norm": 0.966361374738778, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 20077 + }, + { + "epoch": 0.20078, + "grad_norm": 1.096992534192089, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 20078 + }, + { + "epoch": 0.20079, + "grad_norm": 0.9949261719149465, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 20079 + }, + { + "epoch": 0.2008, + "grad_norm": 0.9586607130399144, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 20080 + }, + { + "epoch": 0.20081, + "grad_norm": 0.9217083099859508, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 20081 + }, + { + "epoch": 0.20082, + "grad_norm": 0.8472852150035282, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 20082 + }, + { + "epoch": 0.20083, + "grad_norm": 0.7953677107390348, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 20083 + }, + { + "epoch": 0.20084, + "grad_norm": 0.680053715728845, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 20084 + }, + { + "epoch": 0.20085, + "grad_norm": 0.6585833058754239, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 20085 + }, + { + "epoch": 0.20086, + "grad_norm": 0.6413314084931637, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 20086 + }, + { + "epoch": 0.20087, + "grad_norm": 0.7212903936324376, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 20087 + }, + { + "epoch": 0.20088, + "grad_norm": 0.7845166610714689, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 20088 + }, + { + "epoch": 0.20089, + "grad_norm": 0.8679374978768807, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 20089 + }, + { + "epoch": 0.2009, + "grad_norm": 1.1454127256309485, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 20090 + }, + { + "epoch": 0.20091, + "grad_norm": 1.0632061020326555, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 20091 + }, + { + "epoch": 0.20092, + "grad_norm": 0.9064590068000318, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 20092 + }, + { + "epoch": 0.20093, + "grad_norm": 0.7263723515361191, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 20093 + }, + { + "epoch": 0.20094, + "grad_norm": 0.692241184637872, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 20094 + }, + { + "epoch": 0.20095, + "grad_norm": 0.7333801983406207, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 20095 + }, + { + "epoch": 0.20096, + "grad_norm": 0.794351284519141, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 20096 + }, + { + "epoch": 0.20097, + "grad_norm": 0.8389976820726525, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 20097 + }, + { + "epoch": 0.20098, + "grad_norm": 0.9503021293429343, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 20098 + }, + { + "epoch": 0.20099, + "grad_norm": 0.8926167320432626, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 20099 + }, + { + "epoch": 0.201, + "grad_norm": 1.0433992107615353, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 20100 + }, + { + "epoch": 0.20101, + "grad_norm": 1.0482758149776457, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 20101 + }, + { + "epoch": 0.20102, + "grad_norm": 1.0039374065501359, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 20102 + }, + { + "epoch": 0.20103, + "grad_norm": 1.1933143529367924, + "learning_rate": 0.003, + "loss": 4.072, + "step": 20103 + }, + { + "epoch": 0.20104, + "grad_norm": 0.9896487352504916, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 20104 + }, + { + "epoch": 0.20105, + "grad_norm": 1.2045369043709517, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 20105 + }, + { + "epoch": 0.20106, + "grad_norm": 0.9997509169271485, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 20106 + }, + { + "epoch": 0.20107, + "grad_norm": 1.062355001915111, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 20107 + }, + { + "epoch": 0.20108, + "grad_norm": 0.9383116181994937, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 20108 + }, + { + "epoch": 0.20109, + "grad_norm": 0.802970277897396, + "learning_rate": 0.003, + "loss": 4.075, + "step": 20109 + }, + { + "epoch": 0.2011, + "grad_norm": 0.8553015733607501, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 20110 + }, + { + "epoch": 0.20111, + "grad_norm": 0.9761260316599434, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 20111 + }, + { + "epoch": 0.20112, + "grad_norm": 1.1648781140489337, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 20112 + }, + { + "epoch": 0.20113, + "grad_norm": 0.884934332495618, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 20113 + }, + { + "epoch": 0.20114, + "grad_norm": 0.6920056517435634, + "learning_rate": 0.003, + "loss": 4.045, + "step": 20114 + }, + { + "epoch": 0.20115, + "grad_norm": 0.6035490806756311, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 20115 + }, + { + "epoch": 0.20116, + "grad_norm": 0.5843184361926623, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 20116 + }, + { + "epoch": 0.20117, + "grad_norm": 0.7327969657916124, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 20117 + }, + { + "epoch": 0.20118, + "grad_norm": 0.8113752494147356, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 20118 + }, + { + "epoch": 0.20119, + "grad_norm": 0.9418203424481393, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 20119 + }, + { + "epoch": 0.2012, + "grad_norm": 1.0395802115500303, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 20120 + }, + { + "epoch": 0.20121, + "grad_norm": 0.9336540311815786, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 20121 + }, + { + "epoch": 0.20122, + "grad_norm": 0.8491760428753012, + "learning_rate": 0.003, + "loss": 4.089, + "step": 20122 + }, + { + "epoch": 0.20123, + "grad_norm": 0.7735078244816492, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 20123 + }, + { + "epoch": 0.20124, + "grad_norm": 0.7511367894849116, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 20124 + }, + { + "epoch": 0.20125, + "grad_norm": 0.6969386030563182, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 20125 + }, + { + "epoch": 0.20126, + "grad_norm": 0.6999763773198828, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 20126 + }, + { + "epoch": 0.20127, + "grad_norm": 0.7262174914142938, + "learning_rate": 0.003, + "loss": 4.042, + "step": 20127 + }, + { + "epoch": 0.20128, + "grad_norm": 0.8739883979365946, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 20128 + }, + { + "epoch": 0.20129, + "grad_norm": 0.972758276404007, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 20129 + }, + { + "epoch": 0.2013, + "grad_norm": 1.0737773194009104, + "learning_rate": 0.003, + "loss": 4.08, + "step": 20130 + }, + { + "epoch": 0.20131, + "grad_norm": 0.9937402491948394, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 20131 + }, + { + "epoch": 0.20132, + "grad_norm": 0.9111727913298527, + "learning_rate": 0.003, + "loss": 4.076, + "step": 20132 + }, + { + "epoch": 0.20133, + "grad_norm": 0.7512459276066992, + "learning_rate": 0.003, + "loss": 4.049, + "step": 20133 + }, + { + "epoch": 0.20134, + "grad_norm": 0.7969020965012391, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 20134 + }, + { + "epoch": 0.20135, + "grad_norm": 0.7631114134969357, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 20135 + }, + { + "epoch": 0.20136, + "grad_norm": 0.8350155763338005, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 20136 + }, + { + "epoch": 0.20137, + "grad_norm": 0.9764309152862, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 20137 + }, + { + "epoch": 0.20138, + "grad_norm": 1.0119550487547355, + "learning_rate": 0.003, + "loss": 4.073, + "step": 20138 + }, + { + "epoch": 0.20139, + "grad_norm": 0.951657988230318, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 20139 + }, + { + "epoch": 0.2014, + "grad_norm": 0.9788969500181236, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 20140 + }, + { + "epoch": 0.20141, + "grad_norm": 0.9689570898898988, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 20141 + }, + { + "epoch": 0.20142, + "grad_norm": 1.300572130299451, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 20142 + }, + { + "epoch": 0.20143, + "grad_norm": 1.006274161308575, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 20143 + }, + { + "epoch": 0.20144, + "grad_norm": 0.8999385868838741, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 20144 + }, + { + "epoch": 0.20145, + "grad_norm": 0.9353114000308074, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 20145 + }, + { + "epoch": 0.20146, + "grad_norm": 1.0665073005313308, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 20146 + }, + { + "epoch": 0.20147, + "grad_norm": 1.0174843957750728, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 20147 + }, + { + "epoch": 0.20148, + "grad_norm": 0.9992685927973987, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 20148 + }, + { + "epoch": 0.20149, + "grad_norm": 0.9767025424438643, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 20149 + }, + { + "epoch": 0.2015, + "grad_norm": 1.001394768461815, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 20150 + }, + { + "epoch": 0.20151, + "grad_norm": 1.1170636149419098, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 20151 + }, + { + "epoch": 0.20152, + "grad_norm": 0.9104064337618925, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 20152 + }, + { + "epoch": 0.20153, + "grad_norm": 0.8642714880219691, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 20153 + }, + { + "epoch": 0.20154, + "grad_norm": 1.088692879262389, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 20154 + }, + { + "epoch": 0.20155, + "grad_norm": 1.0857414998387973, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 20155 + }, + { + "epoch": 0.20156, + "grad_norm": 1.157432911663517, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 20156 + }, + { + "epoch": 0.20157, + "grad_norm": 0.8833071071180393, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 20157 + }, + { + "epoch": 0.20158, + "grad_norm": 0.7915807017373842, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 20158 + }, + { + "epoch": 0.20159, + "grad_norm": 0.7512532350134129, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 20159 + }, + { + "epoch": 0.2016, + "grad_norm": 0.6932030260042161, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 20160 + }, + { + "epoch": 0.20161, + "grad_norm": 0.7001222311666129, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 20161 + }, + { + "epoch": 0.20162, + "grad_norm": 0.739884028849956, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 20162 + }, + { + "epoch": 0.20163, + "grad_norm": 0.8449774946380476, + "learning_rate": 0.003, + "loss": 4.07, + "step": 20163 + }, + { + "epoch": 0.20164, + "grad_norm": 0.9717015055538465, + "learning_rate": 0.003, + "loss": 4.098, + "step": 20164 + }, + { + "epoch": 0.20165, + "grad_norm": 1.1412913747142812, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 20165 + }, + { + "epoch": 0.20166, + "grad_norm": 1.0779748856212756, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 20166 + }, + { + "epoch": 0.20167, + "grad_norm": 1.0759243555954132, + "learning_rate": 0.003, + "loss": 4.04, + "step": 20167 + }, + { + "epoch": 0.20168, + "grad_norm": 1.0073448137041712, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 20168 + }, + { + "epoch": 0.20169, + "grad_norm": 0.9629958690270425, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 20169 + }, + { + "epoch": 0.2017, + "grad_norm": 0.9124721385627201, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 20170 + }, + { + "epoch": 0.20171, + "grad_norm": 0.8481950211652999, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 20171 + }, + { + "epoch": 0.20172, + "grad_norm": 0.8660288587365784, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 20172 + }, + { + "epoch": 0.20173, + "grad_norm": 0.9027104326908204, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 20173 + }, + { + "epoch": 0.20174, + "grad_norm": 0.854115571736065, + "learning_rate": 0.003, + "loss": 4.033, + "step": 20174 + }, + { + "epoch": 0.20175, + "grad_norm": 0.922048558283872, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 20175 + }, + { + "epoch": 0.20176, + "grad_norm": 0.8879094371694463, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 20176 + }, + { + "epoch": 0.20177, + "grad_norm": 0.8079154857394067, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 20177 + }, + { + "epoch": 0.20178, + "grad_norm": 0.70027015338177, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 20178 + }, + { + "epoch": 0.20179, + "grad_norm": 0.6990237671533888, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 20179 + }, + { + "epoch": 0.2018, + "grad_norm": 0.719796276637802, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 20180 + }, + { + "epoch": 0.20181, + "grad_norm": 0.7717792879200722, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 20181 + }, + { + "epoch": 0.20182, + "grad_norm": 0.8100546162235994, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 20182 + }, + { + "epoch": 0.20183, + "grad_norm": 0.8883890842393081, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 20183 + }, + { + "epoch": 0.20184, + "grad_norm": 1.0540406723671027, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 20184 + }, + { + "epoch": 0.20185, + "grad_norm": 1.073450857768077, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 20185 + }, + { + "epoch": 0.20186, + "grad_norm": 1.0054829814140989, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 20186 + }, + { + "epoch": 0.20187, + "grad_norm": 1.035973238825491, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 20187 + }, + { + "epoch": 0.20188, + "grad_norm": 0.8482124744198021, + "learning_rate": 0.003, + "loss": 4.075, + "step": 20188 + }, + { + "epoch": 0.20189, + "grad_norm": 0.8146430944762663, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 20189 + }, + { + "epoch": 0.2019, + "grad_norm": 0.6385110414005808, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 20190 + }, + { + "epoch": 0.20191, + "grad_norm": 0.6441925369623385, + "learning_rate": 0.003, + "loss": 4.075, + "step": 20191 + }, + { + "epoch": 0.20192, + "grad_norm": 0.6724475842316535, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 20192 + }, + { + "epoch": 0.20193, + "grad_norm": 0.6739740191097858, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 20193 + }, + { + "epoch": 0.20194, + "grad_norm": 0.6068180262602129, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 20194 + }, + { + "epoch": 0.20195, + "grad_norm": 0.6362317453360848, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 20195 + }, + { + "epoch": 0.20196, + "grad_norm": 0.6490550355193817, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 20196 + }, + { + "epoch": 0.20197, + "grad_norm": 0.7900011403127508, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 20197 + }, + { + "epoch": 0.20198, + "grad_norm": 0.9316803893003885, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 20198 + }, + { + "epoch": 0.20199, + "grad_norm": 1.1392797863745374, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 20199 + }, + { + "epoch": 0.202, + "grad_norm": 0.8636391596598383, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 20200 + }, + { + "epoch": 0.20201, + "grad_norm": 0.7846545207634401, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 20201 + }, + { + "epoch": 0.20202, + "grad_norm": 0.7852673460348296, + "learning_rate": 0.003, + "loss": 4.091, + "step": 20202 + }, + { + "epoch": 0.20203, + "grad_norm": 0.8523936286459818, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 20203 + }, + { + "epoch": 0.20204, + "grad_norm": 1.0220483032405032, + "learning_rate": 0.003, + "loss": 4.079, + "step": 20204 + }, + { + "epoch": 0.20205, + "grad_norm": 1.157570379991195, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 20205 + }, + { + "epoch": 0.20206, + "grad_norm": 1.0744460553145834, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 20206 + }, + { + "epoch": 0.20207, + "grad_norm": 1.039895689176448, + "learning_rate": 0.003, + "loss": 4.049, + "step": 20207 + }, + { + "epoch": 0.20208, + "grad_norm": 1.0432754314459542, + "learning_rate": 0.003, + "loss": 4.038, + "step": 20208 + }, + { + "epoch": 0.20209, + "grad_norm": 1.1489355832747565, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 20209 + }, + { + "epoch": 0.2021, + "grad_norm": 1.0043874100022252, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 20210 + }, + { + "epoch": 0.20211, + "grad_norm": 0.9361477626640528, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 20211 + }, + { + "epoch": 0.20212, + "grad_norm": 0.937678561593136, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 20212 + }, + { + "epoch": 0.20213, + "grad_norm": 0.9656190352445899, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 20213 + }, + { + "epoch": 0.20214, + "grad_norm": 1.002992366793816, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 20214 + }, + { + "epoch": 0.20215, + "grad_norm": 1.11072175656836, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 20215 + }, + { + "epoch": 0.20216, + "grad_norm": 0.8193804166077443, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 20216 + }, + { + "epoch": 0.20217, + "grad_norm": 0.8839116205628246, + "learning_rate": 0.003, + "loss": 4.047, + "step": 20217 + }, + { + "epoch": 0.20218, + "grad_norm": 0.836534442731943, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 20218 + }, + { + "epoch": 0.20219, + "grad_norm": 0.9775258920919472, + "learning_rate": 0.003, + "loss": 4.045, + "step": 20219 + }, + { + "epoch": 0.2022, + "grad_norm": 1.0743878029089904, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 20220 + }, + { + "epoch": 0.20221, + "grad_norm": 0.7787746213572123, + "learning_rate": 0.003, + "loss": 4.063, + "step": 20221 + }, + { + "epoch": 0.20222, + "grad_norm": 0.7756070732132835, + "learning_rate": 0.003, + "loss": 4.058, + "step": 20222 + }, + { + "epoch": 0.20223, + "grad_norm": 0.8074855184620184, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 20223 + }, + { + "epoch": 0.20224, + "grad_norm": 0.8148244426685318, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 20224 + }, + { + "epoch": 0.20225, + "grad_norm": 0.9368431478590745, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 20225 + }, + { + "epoch": 0.20226, + "grad_norm": 1.2806381041315096, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 20226 + }, + { + "epoch": 0.20227, + "grad_norm": 0.9253337166534994, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 20227 + }, + { + "epoch": 0.20228, + "grad_norm": 0.8986734494879194, + "learning_rate": 0.003, + "loss": 4.037, + "step": 20228 + }, + { + "epoch": 0.20229, + "grad_norm": 0.8509111303238335, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 20229 + }, + { + "epoch": 0.2023, + "grad_norm": 0.7731995600417705, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 20230 + }, + { + "epoch": 0.20231, + "grad_norm": 0.7066170308016798, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 20231 + }, + { + "epoch": 0.20232, + "grad_norm": 0.7137967052910865, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 20232 + }, + { + "epoch": 0.20233, + "grad_norm": 0.8261074430325647, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 20233 + }, + { + "epoch": 0.20234, + "grad_norm": 0.9237243484655616, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 20234 + }, + { + "epoch": 0.20235, + "grad_norm": 0.9572598754813575, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 20235 + }, + { + "epoch": 0.20236, + "grad_norm": 0.8782449084523822, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 20236 + }, + { + "epoch": 0.20237, + "grad_norm": 0.8087711252183536, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 20237 + }, + { + "epoch": 0.20238, + "grad_norm": 0.6463272323360605, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 20238 + }, + { + "epoch": 0.20239, + "grad_norm": 0.715465232808237, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 20239 + }, + { + "epoch": 0.2024, + "grad_norm": 0.6632546418356409, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 20240 + }, + { + "epoch": 0.20241, + "grad_norm": 0.7370390154758478, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 20241 + }, + { + "epoch": 0.20242, + "grad_norm": 0.8813729339328653, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 20242 + }, + { + "epoch": 0.20243, + "grad_norm": 0.975946981254012, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 20243 + }, + { + "epoch": 0.20244, + "grad_norm": 1.0454259863575712, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 20244 + }, + { + "epoch": 0.20245, + "grad_norm": 0.9757392887520446, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 20245 + }, + { + "epoch": 0.20246, + "grad_norm": 0.9733053043852302, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 20246 + }, + { + "epoch": 0.20247, + "grad_norm": 0.9515100434968863, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 20247 + }, + { + "epoch": 0.20248, + "grad_norm": 1.0458783512042407, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 20248 + }, + { + "epoch": 0.20249, + "grad_norm": 0.8950611747541216, + "learning_rate": 0.003, + "loss": 4.063, + "step": 20249 + }, + { + "epoch": 0.2025, + "grad_norm": 0.9453026772316225, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 20250 + }, + { + "epoch": 0.20251, + "grad_norm": 0.9299160323287975, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 20251 + }, + { + "epoch": 0.20252, + "grad_norm": 0.8638490249225725, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 20252 + }, + { + "epoch": 0.20253, + "grad_norm": 0.8863765910978502, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 20253 + }, + { + "epoch": 0.20254, + "grad_norm": 1.0156510446622808, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 20254 + }, + { + "epoch": 0.20255, + "grad_norm": 1.2454649637167758, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 20255 + }, + { + "epoch": 0.20256, + "grad_norm": 0.9437518589174526, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 20256 + }, + { + "epoch": 0.20257, + "grad_norm": 1.2598994010738565, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 20257 + }, + { + "epoch": 0.20258, + "grad_norm": 0.8993262476436259, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 20258 + }, + { + "epoch": 0.20259, + "grad_norm": 0.9342959878453537, + "learning_rate": 0.003, + "loss": 4.073, + "step": 20259 + }, + { + "epoch": 0.2026, + "grad_norm": 1.20464026324837, + "learning_rate": 0.003, + "loss": 4.074, + "step": 20260 + }, + { + "epoch": 0.20261, + "grad_norm": 1.1732592843323337, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 20261 + }, + { + "epoch": 0.20262, + "grad_norm": 1.0258714972568241, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 20262 + }, + { + "epoch": 0.20263, + "grad_norm": 1.0385998257771591, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 20263 + }, + { + "epoch": 0.20264, + "grad_norm": 0.8650415025419821, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 20264 + }, + { + "epoch": 0.20265, + "grad_norm": 0.8201303882430809, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 20265 + }, + { + "epoch": 0.20266, + "grad_norm": 0.6738704437704791, + "learning_rate": 0.003, + "loss": 4.046, + "step": 20266 + }, + { + "epoch": 0.20267, + "grad_norm": 0.6896656664142933, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 20267 + }, + { + "epoch": 0.20268, + "grad_norm": 0.7790935277931689, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 20268 + }, + { + "epoch": 0.20269, + "grad_norm": 0.8953200803606676, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 20269 + }, + { + "epoch": 0.2027, + "grad_norm": 0.8901116875320236, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 20270 + }, + { + "epoch": 0.20271, + "grad_norm": 1.0079781240388708, + "learning_rate": 0.003, + "loss": 4.087, + "step": 20271 + }, + { + "epoch": 0.20272, + "grad_norm": 1.112553755440471, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 20272 + }, + { + "epoch": 0.20273, + "grad_norm": 1.0474804000177926, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 20273 + }, + { + "epoch": 0.20274, + "grad_norm": 0.9931938834025967, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 20274 + }, + { + "epoch": 0.20275, + "grad_norm": 0.9872937962963939, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 20275 + }, + { + "epoch": 0.20276, + "grad_norm": 1.0901152333849533, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 20276 + }, + { + "epoch": 0.20277, + "grad_norm": 0.9215653619752749, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 20277 + }, + { + "epoch": 0.20278, + "grad_norm": 0.7634688018371093, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 20278 + }, + { + "epoch": 0.20279, + "grad_norm": 0.7297325604080956, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 20279 + }, + { + "epoch": 0.2028, + "grad_norm": 0.69564024170904, + "learning_rate": 0.003, + "loss": 4.016, + "step": 20280 + }, + { + "epoch": 0.20281, + "grad_norm": 0.7179749761523376, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 20281 + }, + { + "epoch": 0.20282, + "grad_norm": 0.6844268746086973, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 20282 + }, + { + "epoch": 0.20283, + "grad_norm": 0.5854488752875975, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 20283 + }, + { + "epoch": 0.20284, + "grad_norm": 0.5963854325655069, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 20284 + }, + { + "epoch": 0.20285, + "grad_norm": 0.6098938917123762, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 20285 + }, + { + "epoch": 0.20286, + "grad_norm": 0.5962729372147907, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 20286 + }, + { + "epoch": 0.20287, + "grad_norm": 0.6072888650026789, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 20287 + }, + { + "epoch": 0.20288, + "grad_norm": 0.6699623530934661, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 20288 + }, + { + "epoch": 0.20289, + "grad_norm": 0.5807723898612621, + "learning_rate": 0.003, + "loss": 4.059, + "step": 20289 + }, + { + "epoch": 0.2029, + "grad_norm": 0.6953945688022403, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 20290 + }, + { + "epoch": 0.20291, + "grad_norm": 0.8432955118802364, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 20291 + }, + { + "epoch": 0.20292, + "grad_norm": 0.7813820299037514, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 20292 + }, + { + "epoch": 0.20293, + "grad_norm": 0.7398672292874032, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 20293 + }, + { + "epoch": 0.20294, + "grad_norm": 0.9661154390823129, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 20294 + }, + { + "epoch": 0.20295, + "grad_norm": 1.4038009561188358, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 20295 + }, + { + "epoch": 0.20296, + "grad_norm": 1.0482101762314422, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 20296 + }, + { + "epoch": 0.20297, + "grad_norm": 1.0796339006398183, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 20297 + }, + { + "epoch": 0.20298, + "grad_norm": 1.0218724339767467, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 20298 + }, + { + "epoch": 0.20299, + "grad_norm": 1.014017049916483, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 20299 + }, + { + "epoch": 0.203, + "grad_norm": 0.9514457008536702, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 20300 + }, + { + "epoch": 0.20301, + "grad_norm": 0.9451901872687277, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 20301 + }, + { + "epoch": 0.20302, + "grad_norm": 0.9336624962525484, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 20302 + }, + { + "epoch": 0.20303, + "grad_norm": 0.9104782036823139, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 20303 + }, + { + "epoch": 0.20304, + "grad_norm": 0.8793264935377031, + "learning_rate": 0.003, + "loss": 4.047, + "step": 20304 + }, + { + "epoch": 0.20305, + "grad_norm": 0.920158024134499, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 20305 + }, + { + "epoch": 0.20306, + "grad_norm": 0.9829195665231453, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 20306 + }, + { + "epoch": 0.20307, + "grad_norm": 0.9827866420816818, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 20307 + }, + { + "epoch": 0.20308, + "grad_norm": 1.009209299906646, + "learning_rate": 0.003, + "loss": 4.06, + "step": 20308 + }, + { + "epoch": 0.20309, + "grad_norm": 0.9132899948677269, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 20309 + }, + { + "epoch": 0.2031, + "grad_norm": 0.830284613812082, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 20310 + }, + { + "epoch": 0.20311, + "grad_norm": 0.9944728139270856, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 20311 + }, + { + "epoch": 0.20312, + "grad_norm": 1.058719052336157, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 20312 + }, + { + "epoch": 0.20313, + "grad_norm": 0.9071972229971375, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 20313 + }, + { + "epoch": 0.20314, + "grad_norm": 0.9422127166926286, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 20314 + }, + { + "epoch": 0.20315, + "grad_norm": 0.9936909107378495, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 20315 + }, + { + "epoch": 0.20316, + "grad_norm": 1.162536321472046, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 20316 + }, + { + "epoch": 0.20317, + "grad_norm": 0.9194171985720333, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 20317 + }, + { + "epoch": 0.20318, + "grad_norm": 0.9935602341141521, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 20318 + }, + { + "epoch": 0.20319, + "grad_norm": 1.0851518539442337, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 20319 + }, + { + "epoch": 0.2032, + "grad_norm": 1.035486145364161, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 20320 + }, + { + "epoch": 0.20321, + "grad_norm": 1.091192956432024, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 20321 + }, + { + "epoch": 0.20322, + "grad_norm": 1.0004206355223928, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 20322 + }, + { + "epoch": 0.20323, + "grad_norm": 1.2288960990384825, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 20323 + }, + { + "epoch": 0.20324, + "grad_norm": 0.8304791348400947, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 20324 + }, + { + "epoch": 0.20325, + "grad_norm": 0.8064271454877502, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 20325 + }, + { + "epoch": 0.20326, + "grad_norm": 0.800986609943331, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 20326 + }, + { + "epoch": 0.20327, + "grad_norm": 0.6831816181323651, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 20327 + }, + { + "epoch": 0.20328, + "grad_norm": 0.6698244473488945, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 20328 + }, + { + "epoch": 0.20329, + "grad_norm": 0.6836107974459831, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 20329 + }, + { + "epoch": 0.2033, + "grad_norm": 0.6670341468558074, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 20330 + }, + { + "epoch": 0.20331, + "grad_norm": 0.6964106515599698, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 20331 + }, + { + "epoch": 0.20332, + "grad_norm": 0.8326064308556393, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 20332 + }, + { + "epoch": 0.20333, + "grad_norm": 0.8854471703948902, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 20333 + }, + { + "epoch": 0.20334, + "grad_norm": 0.9832363699873181, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 20334 + }, + { + "epoch": 0.20335, + "grad_norm": 0.9749256269296941, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 20335 + }, + { + "epoch": 0.20336, + "grad_norm": 0.9308474509699046, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 20336 + }, + { + "epoch": 0.20337, + "grad_norm": 0.8646646010288128, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 20337 + }, + { + "epoch": 0.20338, + "grad_norm": 0.7213441182344377, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 20338 + }, + { + "epoch": 0.20339, + "grad_norm": 0.8554332295457631, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 20339 + }, + { + "epoch": 0.2034, + "grad_norm": 1.150390161504409, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 20340 + }, + { + "epoch": 0.20341, + "grad_norm": 0.9841655065329525, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 20341 + }, + { + "epoch": 0.20342, + "grad_norm": 0.9712187061482087, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 20342 + }, + { + "epoch": 0.20343, + "grad_norm": 1.1508129114703876, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 20343 + }, + { + "epoch": 0.20344, + "grad_norm": 0.8801541375228003, + "learning_rate": 0.003, + "loss": 4.051, + "step": 20344 + }, + { + "epoch": 0.20345, + "grad_norm": 0.8338339714908491, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 20345 + }, + { + "epoch": 0.20346, + "grad_norm": 0.8981142243378957, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 20346 + }, + { + "epoch": 0.20347, + "grad_norm": 1.10750992003621, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 20347 + }, + { + "epoch": 0.20348, + "grad_norm": 0.9262372518515493, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 20348 + }, + { + "epoch": 0.20349, + "grad_norm": 0.9083627863692587, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 20349 + }, + { + "epoch": 0.2035, + "grad_norm": 0.9669370176657444, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 20350 + }, + { + "epoch": 0.20351, + "grad_norm": 0.9371760903200274, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 20351 + }, + { + "epoch": 0.20352, + "grad_norm": 1.0764525814282575, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 20352 + }, + { + "epoch": 0.20353, + "grad_norm": 0.9451992015799721, + "learning_rate": 0.003, + "loss": 4.059, + "step": 20353 + }, + { + "epoch": 0.20354, + "grad_norm": 0.9475795627950271, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 20354 + }, + { + "epoch": 0.20355, + "grad_norm": 0.8889366126293408, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 20355 + }, + { + "epoch": 0.20356, + "grad_norm": 0.8715422943535401, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 20356 + }, + { + "epoch": 0.20357, + "grad_norm": 0.8724064789591822, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 20357 + }, + { + "epoch": 0.20358, + "grad_norm": 0.7577581334178127, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 20358 + }, + { + "epoch": 0.20359, + "grad_norm": 0.849673720907657, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 20359 + }, + { + "epoch": 0.2036, + "grad_norm": 0.9464323635169056, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 20360 + }, + { + "epoch": 0.20361, + "grad_norm": 1.0132644342636135, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 20361 + }, + { + "epoch": 0.20362, + "grad_norm": 1.113709142377863, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 20362 + }, + { + "epoch": 0.20363, + "grad_norm": 1.096318058425732, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 20363 + }, + { + "epoch": 0.20364, + "grad_norm": 0.8918511224263473, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 20364 + }, + { + "epoch": 0.20365, + "grad_norm": 0.8814368778264271, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 20365 + }, + { + "epoch": 0.20366, + "grad_norm": 1.008547111121039, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 20366 + }, + { + "epoch": 0.20367, + "grad_norm": 0.9860100692413479, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 20367 + }, + { + "epoch": 0.20368, + "grad_norm": 0.8759594785251976, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 20368 + }, + { + "epoch": 0.20369, + "grad_norm": 0.9018001691983991, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 20369 + }, + { + "epoch": 0.2037, + "grad_norm": 0.9071969496138035, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 20370 + }, + { + "epoch": 0.20371, + "grad_norm": 0.9574620976549254, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 20371 + }, + { + "epoch": 0.20372, + "grad_norm": 1.1102602667409915, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 20372 + }, + { + "epoch": 0.20373, + "grad_norm": 0.8446084170221155, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 20373 + }, + { + "epoch": 0.20374, + "grad_norm": 0.7107658690378214, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 20374 + }, + { + "epoch": 0.20375, + "grad_norm": 0.6622757320982561, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 20375 + }, + { + "epoch": 0.20376, + "grad_norm": 0.7660040640261622, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 20376 + }, + { + "epoch": 0.20377, + "grad_norm": 0.8934638908111394, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 20377 + }, + { + "epoch": 0.20378, + "grad_norm": 1.0627034179668389, + "learning_rate": 0.003, + "loss": 4.06, + "step": 20378 + }, + { + "epoch": 0.20379, + "grad_norm": 1.0354579035375133, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 20379 + }, + { + "epoch": 0.2038, + "grad_norm": 0.9626189025237234, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 20380 + }, + { + "epoch": 0.20381, + "grad_norm": 0.8320616410557015, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 20381 + }, + { + "epoch": 0.20382, + "grad_norm": 0.7570190863218998, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 20382 + }, + { + "epoch": 0.20383, + "grad_norm": 0.8588888356312102, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 20383 + }, + { + "epoch": 0.20384, + "grad_norm": 0.8352489656803421, + "learning_rate": 0.003, + "loss": 4.089, + "step": 20384 + }, + { + "epoch": 0.20385, + "grad_norm": 0.8449320318114233, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 20385 + }, + { + "epoch": 0.20386, + "grad_norm": 0.9043727654539504, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 20386 + }, + { + "epoch": 0.20387, + "grad_norm": 0.9262840876713996, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 20387 + }, + { + "epoch": 0.20388, + "grad_norm": 0.7633299488947757, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 20388 + }, + { + "epoch": 0.20389, + "grad_norm": 0.7089061109988785, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 20389 + }, + { + "epoch": 0.2039, + "grad_norm": 0.6852516371589236, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 20390 + }, + { + "epoch": 0.20391, + "grad_norm": 0.6890443833152825, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 20391 + }, + { + "epoch": 0.20392, + "grad_norm": 0.6950555176149104, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 20392 + }, + { + "epoch": 0.20393, + "grad_norm": 0.7488331772603604, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 20393 + }, + { + "epoch": 0.20394, + "grad_norm": 0.9987496497416537, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 20394 + }, + { + "epoch": 0.20395, + "grad_norm": 1.4270099794959688, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 20395 + }, + { + "epoch": 0.20396, + "grad_norm": 0.6885336538394853, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 20396 + }, + { + "epoch": 0.20397, + "grad_norm": 0.7090730847797369, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 20397 + }, + { + "epoch": 0.20398, + "grad_norm": 0.6994073842065481, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 20398 + }, + { + "epoch": 0.20399, + "grad_norm": 0.8458455248697703, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 20399 + }, + { + "epoch": 0.204, + "grad_norm": 0.9695888822578032, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 20400 + }, + { + "epoch": 0.20401, + "grad_norm": 1.078946888911514, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 20401 + }, + { + "epoch": 0.20402, + "grad_norm": 1.1196792709555141, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 20402 + }, + { + "epoch": 0.20403, + "grad_norm": 1.3348823087617145, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 20403 + }, + { + "epoch": 0.20404, + "grad_norm": 0.8009892149025997, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 20404 + }, + { + "epoch": 0.20405, + "grad_norm": 0.7065100835983656, + "learning_rate": 0.003, + "loss": 4.038, + "step": 20405 + }, + { + "epoch": 0.20406, + "grad_norm": 0.7573713286699112, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 20406 + }, + { + "epoch": 0.20407, + "grad_norm": 0.6967802330299365, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 20407 + }, + { + "epoch": 0.20408, + "grad_norm": 0.8271654350425968, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 20408 + }, + { + "epoch": 0.20409, + "grad_norm": 0.9871896047370817, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 20409 + }, + { + "epoch": 0.2041, + "grad_norm": 1.0198483403852818, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 20410 + }, + { + "epoch": 0.20411, + "grad_norm": 0.9487269969845029, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 20411 + }, + { + "epoch": 0.20412, + "grad_norm": 0.8475615412902817, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 20412 + }, + { + "epoch": 0.20413, + "grad_norm": 0.759605745291794, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 20413 + }, + { + "epoch": 0.20414, + "grad_norm": 0.695891917218297, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 20414 + }, + { + "epoch": 0.20415, + "grad_norm": 0.6323612963788035, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 20415 + }, + { + "epoch": 0.20416, + "grad_norm": 0.8703389776793776, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 20416 + }, + { + "epoch": 0.20417, + "grad_norm": 1.3567765720103246, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 20417 + }, + { + "epoch": 0.20418, + "grad_norm": 0.9088617391430327, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 20418 + }, + { + "epoch": 0.20419, + "grad_norm": 0.9305037687687804, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 20419 + }, + { + "epoch": 0.2042, + "grad_norm": 0.9393247490018447, + "learning_rate": 0.003, + "loss": 4.047, + "step": 20420 + }, + { + "epoch": 0.20421, + "grad_norm": 1.084251376364569, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 20421 + }, + { + "epoch": 0.20422, + "grad_norm": 1.0480779731881558, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 20422 + }, + { + "epoch": 0.20423, + "grad_norm": 1.096185344863322, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 20423 + }, + { + "epoch": 0.20424, + "grad_norm": 0.8554250545045515, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 20424 + }, + { + "epoch": 0.20425, + "grad_norm": 0.7402055273951321, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 20425 + }, + { + "epoch": 0.20426, + "grad_norm": 0.7666717015350778, + "learning_rate": 0.003, + "loss": 4.056, + "step": 20426 + }, + { + "epoch": 0.20427, + "grad_norm": 0.8699294213206213, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 20427 + }, + { + "epoch": 0.20428, + "grad_norm": 1.036872305261095, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 20428 + }, + { + "epoch": 0.20429, + "grad_norm": 1.0979866792095243, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 20429 + }, + { + "epoch": 0.2043, + "grad_norm": 0.7783377055456764, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 20430 + }, + { + "epoch": 0.20431, + "grad_norm": 0.5795730695623161, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 20431 + }, + { + "epoch": 0.20432, + "grad_norm": 0.5990744608085088, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 20432 + }, + { + "epoch": 0.20433, + "grad_norm": 0.8101839593489611, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 20433 + }, + { + "epoch": 0.20434, + "grad_norm": 0.9516803451186477, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 20434 + }, + { + "epoch": 0.20435, + "grad_norm": 0.9590346770043936, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 20435 + }, + { + "epoch": 0.20436, + "grad_norm": 0.8573206563355135, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 20436 + }, + { + "epoch": 0.20437, + "grad_norm": 0.789035545952203, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 20437 + }, + { + "epoch": 0.20438, + "grad_norm": 0.8466330985259529, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 20438 + }, + { + "epoch": 0.20439, + "grad_norm": 0.8619669794560092, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 20439 + }, + { + "epoch": 0.2044, + "grad_norm": 1.0426690094942102, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 20440 + }, + { + "epoch": 0.20441, + "grad_norm": 1.116082469843466, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 20441 + }, + { + "epoch": 0.20442, + "grad_norm": 0.9464521441924234, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 20442 + }, + { + "epoch": 0.20443, + "grad_norm": 0.9802945819296564, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 20443 + }, + { + "epoch": 0.20444, + "grad_norm": 1.0553713145146217, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 20444 + }, + { + "epoch": 0.20445, + "grad_norm": 0.9954685649700676, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 20445 + }, + { + "epoch": 0.20446, + "grad_norm": 0.9494693143464139, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 20446 + }, + { + "epoch": 0.20447, + "grad_norm": 0.833211628258708, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 20447 + }, + { + "epoch": 0.20448, + "grad_norm": 0.7456359889608268, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 20448 + }, + { + "epoch": 0.20449, + "grad_norm": 0.8680749039270492, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 20449 + }, + { + "epoch": 0.2045, + "grad_norm": 1.0406741653471223, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 20450 + }, + { + "epoch": 0.20451, + "grad_norm": 1.105849498852683, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 20451 + }, + { + "epoch": 0.20452, + "grad_norm": 0.9190408394695084, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 20452 + }, + { + "epoch": 0.20453, + "grad_norm": 0.7934413235091964, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 20453 + }, + { + "epoch": 0.20454, + "grad_norm": 0.6756722314647762, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 20454 + }, + { + "epoch": 0.20455, + "grad_norm": 0.7120986098490817, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 20455 + }, + { + "epoch": 0.20456, + "grad_norm": 0.7610572564116208, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 20456 + }, + { + "epoch": 0.20457, + "grad_norm": 0.7434780570804814, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 20457 + }, + { + "epoch": 0.20458, + "grad_norm": 0.7141520120292778, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 20458 + }, + { + "epoch": 0.20459, + "grad_norm": 0.7490897191636853, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 20459 + }, + { + "epoch": 0.2046, + "grad_norm": 0.7523301229952136, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 20460 + }, + { + "epoch": 0.20461, + "grad_norm": 0.7648327990348492, + "learning_rate": 0.003, + "loss": 4.054, + "step": 20461 + }, + { + "epoch": 0.20462, + "grad_norm": 0.7794387966822662, + "learning_rate": 0.003, + "loss": 4.049, + "step": 20462 + }, + { + "epoch": 0.20463, + "grad_norm": 0.9098949286973006, + "learning_rate": 0.003, + "loss": 4.042, + "step": 20463 + }, + { + "epoch": 0.20464, + "grad_norm": 1.1543422982397333, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 20464 + }, + { + "epoch": 0.20465, + "grad_norm": 1.0639337478565065, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 20465 + }, + { + "epoch": 0.20466, + "grad_norm": 0.9650600560385093, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 20466 + }, + { + "epoch": 0.20467, + "grad_norm": 0.8787347611544989, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 20467 + }, + { + "epoch": 0.20468, + "grad_norm": 0.8979324706387466, + "learning_rate": 0.003, + "loss": 4.067, + "step": 20468 + }, + { + "epoch": 0.20469, + "grad_norm": 0.9565197110482284, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 20469 + }, + { + "epoch": 0.2047, + "grad_norm": 1.001766424582976, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 20470 + }, + { + "epoch": 0.20471, + "grad_norm": 0.905690577277381, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 20471 + }, + { + "epoch": 0.20472, + "grad_norm": 0.8582802231298753, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 20472 + }, + { + "epoch": 0.20473, + "grad_norm": 1.0099994274527537, + "learning_rate": 0.003, + "loss": 4.068, + "step": 20473 + }, + { + "epoch": 0.20474, + "grad_norm": 1.1946003596306265, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 20474 + }, + { + "epoch": 0.20475, + "grad_norm": 0.8619692680160503, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 20475 + }, + { + "epoch": 0.20476, + "grad_norm": 0.9559448796469671, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 20476 + }, + { + "epoch": 0.20477, + "grad_norm": 0.9519911928111551, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 20477 + }, + { + "epoch": 0.20478, + "grad_norm": 1.1173372621957183, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 20478 + }, + { + "epoch": 0.20479, + "grad_norm": 1.060764951724005, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 20479 + }, + { + "epoch": 0.2048, + "grad_norm": 0.9867478296946318, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 20480 + }, + { + "epoch": 0.20481, + "grad_norm": 0.8170563073509959, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 20481 + }, + { + "epoch": 0.20482, + "grad_norm": 0.6254490750945555, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 20482 + }, + { + "epoch": 0.20483, + "grad_norm": 0.7207533839917315, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 20483 + }, + { + "epoch": 0.20484, + "grad_norm": 0.7418929333969281, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 20484 + }, + { + "epoch": 0.20485, + "grad_norm": 0.7465021535974078, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 20485 + }, + { + "epoch": 0.20486, + "grad_norm": 0.7769528365946727, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 20486 + }, + { + "epoch": 0.20487, + "grad_norm": 0.8130562766781182, + "learning_rate": 0.003, + "loss": 4.069, + "step": 20487 + }, + { + "epoch": 0.20488, + "grad_norm": 0.983521172208961, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 20488 + }, + { + "epoch": 0.20489, + "grad_norm": 1.0477977456666285, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 20489 + }, + { + "epoch": 0.2049, + "grad_norm": 0.846817393957543, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 20490 + }, + { + "epoch": 0.20491, + "grad_norm": 0.8217040692277149, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 20491 + }, + { + "epoch": 0.20492, + "grad_norm": 0.8152888096564821, + "learning_rate": 0.003, + "loss": 4.054, + "step": 20492 + }, + { + "epoch": 0.20493, + "grad_norm": 0.8988752345047367, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 20493 + }, + { + "epoch": 0.20494, + "grad_norm": 0.8345402495927507, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 20494 + }, + { + "epoch": 0.20495, + "grad_norm": 0.8456013999689221, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 20495 + }, + { + "epoch": 0.20496, + "grad_norm": 0.9663470825220668, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 20496 + }, + { + "epoch": 0.20497, + "grad_norm": 0.9229345421813118, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 20497 + }, + { + "epoch": 0.20498, + "grad_norm": 1.0108427865567675, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 20498 + }, + { + "epoch": 0.20499, + "grad_norm": 1.1721512182378613, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 20499 + }, + { + "epoch": 0.205, + "grad_norm": 1.2164014831498786, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 20500 + }, + { + "epoch": 0.20501, + "grad_norm": 0.7620436039619853, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 20501 + }, + { + "epoch": 0.20502, + "grad_norm": 0.6769871006699449, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 20502 + }, + { + "epoch": 0.20503, + "grad_norm": 0.754603163994319, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 20503 + }, + { + "epoch": 0.20504, + "grad_norm": 0.730669852587436, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 20504 + }, + { + "epoch": 0.20505, + "grad_norm": 0.7129773647468785, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 20505 + }, + { + "epoch": 0.20506, + "grad_norm": 0.6374569852668859, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 20506 + }, + { + "epoch": 0.20507, + "grad_norm": 0.594683379131078, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 20507 + }, + { + "epoch": 0.20508, + "grad_norm": 0.6885703086274533, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 20508 + }, + { + "epoch": 0.20509, + "grad_norm": 0.7470187540028812, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 20509 + }, + { + "epoch": 0.2051, + "grad_norm": 0.6680552314952584, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 20510 + }, + { + "epoch": 0.20511, + "grad_norm": 0.7365770870646581, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 20511 + }, + { + "epoch": 0.20512, + "grad_norm": 1.1206830302869843, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 20512 + }, + { + "epoch": 0.20513, + "grad_norm": 1.3611316699735958, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 20513 + }, + { + "epoch": 0.20514, + "grad_norm": 0.7911528604006666, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 20514 + }, + { + "epoch": 0.20515, + "grad_norm": 0.7922819361015677, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 20515 + }, + { + "epoch": 0.20516, + "grad_norm": 0.8012931274001179, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 20516 + }, + { + "epoch": 0.20517, + "grad_norm": 0.8098025953248225, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 20517 + }, + { + "epoch": 0.20518, + "grad_norm": 0.8653460020279925, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 20518 + }, + { + "epoch": 0.20519, + "grad_norm": 0.9427554891941522, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 20519 + }, + { + "epoch": 0.2052, + "grad_norm": 1.228778011738753, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 20520 + }, + { + "epoch": 0.20521, + "grad_norm": 0.8243281934386077, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 20521 + }, + { + "epoch": 0.20522, + "grad_norm": 0.7788285962108208, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 20522 + }, + { + "epoch": 0.20523, + "grad_norm": 0.9339471235267384, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 20523 + }, + { + "epoch": 0.20524, + "grad_norm": 1.0396932139833395, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 20524 + }, + { + "epoch": 0.20525, + "grad_norm": 1.005402172635994, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 20525 + }, + { + "epoch": 0.20526, + "grad_norm": 1.053508768873512, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 20526 + }, + { + "epoch": 0.20527, + "grad_norm": 1.2508069802693835, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 20527 + }, + { + "epoch": 0.20528, + "grad_norm": 0.8663705990405522, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 20528 + }, + { + "epoch": 0.20529, + "grad_norm": 0.8818611675931979, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 20529 + }, + { + "epoch": 0.2053, + "grad_norm": 0.9200535337245385, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 20530 + }, + { + "epoch": 0.20531, + "grad_norm": 0.9898780949855865, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 20531 + }, + { + "epoch": 0.20532, + "grad_norm": 0.9578834118378134, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 20532 + }, + { + "epoch": 0.20533, + "grad_norm": 0.9052817941059021, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 20533 + }, + { + "epoch": 0.20534, + "grad_norm": 0.779722377291353, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 20534 + }, + { + "epoch": 0.20535, + "grad_norm": 0.7346532313137373, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 20535 + }, + { + "epoch": 0.20536, + "grad_norm": 0.7093779802165369, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 20536 + }, + { + "epoch": 0.20537, + "grad_norm": 0.7930251293994007, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 20537 + }, + { + "epoch": 0.20538, + "grad_norm": 0.8530749088036973, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 20538 + }, + { + "epoch": 0.20539, + "grad_norm": 0.8372602973870564, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 20539 + }, + { + "epoch": 0.2054, + "grad_norm": 0.863324991148601, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 20540 + }, + { + "epoch": 0.20541, + "grad_norm": 0.9112910016192655, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 20541 + }, + { + "epoch": 0.20542, + "grad_norm": 0.9433012968095559, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 20542 + }, + { + "epoch": 0.20543, + "grad_norm": 1.0913271019690083, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 20543 + }, + { + "epoch": 0.20544, + "grad_norm": 0.8549544332057314, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 20544 + }, + { + "epoch": 0.20545, + "grad_norm": 0.914582378517783, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 20545 + }, + { + "epoch": 0.20546, + "grad_norm": 0.9273838748901237, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 20546 + }, + { + "epoch": 0.20547, + "grad_norm": 0.8948707723708691, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 20547 + }, + { + "epoch": 0.20548, + "grad_norm": 0.8302326146067596, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 20548 + }, + { + "epoch": 0.20549, + "grad_norm": 0.7696476080082122, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 20549 + }, + { + "epoch": 0.2055, + "grad_norm": 0.8407035755703616, + "learning_rate": 0.003, + "loss": 4.071, + "step": 20550 + }, + { + "epoch": 0.20551, + "grad_norm": 0.9257701985007101, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 20551 + }, + { + "epoch": 0.20552, + "grad_norm": 1.1374062132549967, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 20552 + }, + { + "epoch": 0.20553, + "grad_norm": 1.2789927383290687, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 20553 + }, + { + "epoch": 0.20554, + "grad_norm": 1.00202249688981, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 20554 + }, + { + "epoch": 0.20555, + "grad_norm": 0.9794066785980535, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 20555 + }, + { + "epoch": 0.20556, + "grad_norm": 0.9513503323769013, + "learning_rate": 0.003, + "loss": 4.045, + "step": 20556 + }, + { + "epoch": 0.20557, + "grad_norm": 0.9914266893871104, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 20557 + }, + { + "epoch": 0.20558, + "grad_norm": 0.9730358479260954, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 20558 + }, + { + "epoch": 0.20559, + "grad_norm": 0.9337728936224277, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 20559 + }, + { + "epoch": 0.2056, + "grad_norm": 0.8117965590914733, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 20560 + }, + { + "epoch": 0.20561, + "grad_norm": 0.6904545134598877, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 20561 + }, + { + "epoch": 0.20562, + "grad_norm": 0.6025300536075463, + "learning_rate": 0.003, + "loss": 4.057, + "step": 20562 + }, + { + "epoch": 0.20563, + "grad_norm": 0.6224085371276316, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 20563 + }, + { + "epoch": 0.20564, + "grad_norm": 0.6953086951954939, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 20564 + }, + { + "epoch": 0.20565, + "grad_norm": 0.8842491341529575, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 20565 + }, + { + "epoch": 0.20566, + "grad_norm": 1.1292077988719111, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 20566 + }, + { + "epoch": 0.20567, + "grad_norm": 0.9055261812823389, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 20567 + }, + { + "epoch": 0.20568, + "grad_norm": 0.8591778891675724, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 20568 + }, + { + "epoch": 0.20569, + "grad_norm": 0.7893300512642796, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 20569 + }, + { + "epoch": 0.2057, + "grad_norm": 0.7695695498222233, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 20570 + }, + { + "epoch": 0.20571, + "grad_norm": 0.7953962449919407, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 20571 + }, + { + "epoch": 0.20572, + "grad_norm": 0.7986956299090745, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 20572 + }, + { + "epoch": 0.20573, + "grad_norm": 0.7623400127898218, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 20573 + }, + { + "epoch": 0.20574, + "grad_norm": 0.8318733489796745, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 20574 + }, + { + "epoch": 0.20575, + "grad_norm": 0.8858423352603446, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 20575 + }, + { + "epoch": 0.20576, + "grad_norm": 0.8172688016100488, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 20576 + }, + { + "epoch": 0.20577, + "grad_norm": 0.7901722138410501, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 20577 + }, + { + "epoch": 0.20578, + "grad_norm": 0.9796718420688237, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 20578 + }, + { + "epoch": 0.20579, + "grad_norm": 1.2726068969477138, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 20579 + }, + { + "epoch": 0.2058, + "grad_norm": 0.9736309551465265, + "learning_rate": 0.003, + "loss": 4.074, + "step": 20580 + }, + { + "epoch": 0.20581, + "grad_norm": 1.0251965502061382, + "learning_rate": 0.003, + "loss": 4.052, + "step": 20581 + }, + { + "epoch": 0.20582, + "grad_norm": 0.9175721249187118, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 20582 + }, + { + "epoch": 0.20583, + "grad_norm": 1.0244517007583627, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 20583 + }, + { + "epoch": 0.20584, + "grad_norm": 1.0322391854094188, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 20584 + }, + { + "epoch": 0.20585, + "grad_norm": 1.0344090146461153, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 20585 + }, + { + "epoch": 0.20586, + "grad_norm": 0.9806018690955354, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 20586 + }, + { + "epoch": 0.20587, + "grad_norm": 0.7852323290753973, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 20587 + }, + { + "epoch": 0.20588, + "grad_norm": 0.7715173013911564, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 20588 + }, + { + "epoch": 0.20589, + "grad_norm": 0.6826629659862694, + "learning_rate": 0.003, + "loss": 4.047, + "step": 20589 + }, + { + "epoch": 0.2059, + "grad_norm": 0.7476244055476694, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 20590 + }, + { + "epoch": 0.20591, + "grad_norm": 0.6936211816688176, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 20591 + }, + { + "epoch": 0.20592, + "grad_norm": 0.7362555338656742, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 20592 + }, + { + "epoch": 0.20593, + "grad_norm": 0.7607218171203771, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 20593 + }, + { + "epoch": 0.20594, + "grad_norm": 0.7840412765664337, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 20594 + }, + { + "epoch": 0.20595, + "grad_norm": 0.8361335106376782, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 20595 + }, + { + "epoch": 0.20596, + "grad_norm": 1.077492727067051, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 20596 + }, + { + "epoch": 0.20597, + "grad_norm": 1.3908877765779695, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 20597 + }, + { + "epoch": 0.20598, + "grad_norm": 0.7605470713242518, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 20598 + }, + { + "epoch": 0.20599, + "grad_norm": 0.6904481401622857, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 20599 + }, + { + "epoch": 0.206, + "grad_norm": 0.6344621316372291, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 20600 + }, + { + "epoch": 0.20601, + "grad_norm": 0.6366942136686657, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 20601 + }, + { + "epoch": 0.20602, + "grad_norm": 0.6067438234087, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 20602 + }, + { + "epoch": 0.20603, + "grad_norm": 0.7246511436585634, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 20603 + }, + { + "epoch": 0.20604, + "grad_norm": 1.0320882304582961, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 20604 + }, + { + "epoch": 0.20605, + "grad_norm": 1.2606764251968554, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 20605 + }, + { + "epoch": 0.20606, + "grad_norm": 0.7289654659007048, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 20606 + }, + { + "epoch": 0.20607, + "grad_norm": 0.718277453279945, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 20607 + }, + { + "epoch": 0.20608, + "grad_norm": 0.790622781305276, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 20608 + }, + { + "epoch": 0.20609, + "grad_norm": 0.9594703957779677, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 20609 + }, + { + "epoch": 0.2061, + "grad_norm": 1.2055963398714216, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 20610 + }, + { + "epoch": 0.20611, + "grad_norm": 1.019951400337982, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 20611 + }, + { + "epoch": 0.20612, + "grad_norm": 1.0161482326347722, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 20612 + }, + { + "epoch": 0.20613, + "grad_norm": 0.9709674734422439, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 20613 + }, + { + "epoch": 0.20614, + "grad_norm": 0.9265254257637886, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 20614 + }, + { + "epoch": 0.20615, + "grad_norm": 0.9423442343289178, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 20615 + }, + { + "epoch": 0.20616, + "grad_norm": 1.0135090207025705, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 20616 + }, + { + "epoch": 0.20617, + "grad_norm": 1.1006216032314815, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 20617 + }, + { + "epoch": 0.20618, + "grad_norm": 0.954842336561467, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 20618 + }, + { + "epoch": 0.20619, + "grad_norm": 0.7835498651032616, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 20619 + }, + { + "epoch": 0.2062, + "grad_norm": 0.807016408459512, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 20620 + }, + { + "epoch": 0.20621, + "grad_norm": 1.0172214859491475, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 20621 + }, + { + "epoch": 0.20622, + "grad_norm": 1.2010884728781617, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 20622 + }, + { + "epoch": 0.20623, + "grad_norm": 0.9414728170219171, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 20623 + }, + { + "epoch": 0.20624, + "grad_norm": 1.0085687448405847, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 20624 + }, + { + "epoch": 0.20625, + "grad_norm": 0.9953683575782653, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 20625 + }, + { + "epoch": 0.20626, + "grad_norm": 0.9928068630689973, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 20626 + }, + { + "epoch": 0.20627, + "grad_norm": 0.9844217324248021, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 20627 + }, + { + "epoch": 0.20628, + "grad_norm": 1.0190362124410843, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 20628 + }, + { + "epoch": 0.20629, + "grad_norm": 0.9439978860033111, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 20629 + }, + { + "epoch": 0.2063, + "grad_norm": 0.9253956720686918, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 20630 + }, + { + "epoch": 0.20631, + "grad_norm": 1.0515591053619255, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 20631 + }, + { + "epoch": 0.20632, + "grad_norm": 1.023671020246924, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 20632 + }, + { + "epoch": 0.20633, + "grad_norm": 0.86648143766903, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 20633 + }, + { + "epoch": 0.20634, + "grad_norm": 0.8333367278901644, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 20634 + }, + { + "epoch": 0.20635, + "grad_norm": 0.8647866587051434, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 20635 + }, + { + "epoch": 0.20636, + "grad_norm": 1.020750856654751, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 20636 + }, + { + "epoch": 0.20637, + "grad_norm": 1.204925371161771, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 20637 + }, + { + "epoch": 0.20638, + "grad_norm": 0.7568887978599166, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 20638 + }, + { + "epoch": 0.20639, + "grad_norm": 0.7852977782044938, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 20639 + }, + { + "epoch": 0.2064, + "grad_norm": 0.8388052677801658, + "learning_rate": 0.003, + "loss": 4.073, + "step": 20640 + }, + { + "epoch": 0.20641, + "grad_norm": 0.891758605286488, + "learning_rate": 0.003, + "loss": 4.059, + "step": 20641 + }, + { + "epoch": 0.20642, + "grad_norm": 0.9364290039788794, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 20642 + }, + { + "epoch": 0.20643, + "grad_norm": 1.1980112051647338, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 20643 + }, + { + "epoch": 0.20644, + "grad_norm": 1.0793532517196802, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 20644 + }, + { + "epoch": 0.20645, + "grad_norm": 0.965015504488339, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 20645 + }, + { + "epoch": 0.20646, + "grad_norm": 1.0813336793108117, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 20646 + }, + { + "epoch": 0.20647, + "grad_norm": 0.9981493057543689, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 20647 + }, + { + "epoch": 0.20648, + "grad_norm": 0.8428348987809836, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 20648 + }, + { + "epoch": 0.20649, + "grad_norm": 0.6849644189236285, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 20649 + }, + { + "epoch": 0.2065, + "grad_norm": 0.6017190142529987, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 20650 + }, + { + "epoch": 0.20651, + "grad_norm": 0.6250684217321145, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 20651 + }, + { + "epoch": 0.20652, + "grad_norm": 0.6188968935601046, + "learning_rate": 0.003, + "loss": 3.9993, + "step": 20652 + }, + { + "epoch": 0.20653, + "grad_norm": 0.5363667060974162, + "learning_rate": 0.003, + "loss": 4.046, + "step": 20653 + }, + { + "epoch": 0.20654, + "grad_norm": 0.6347384871845065, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 20654 + }, + { + "epoch": 0.20655, + "grad_norm": 0.8032137065026761, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 20655 + }, + { + "epoch": 0.20656, + "grad_norm": 1.0406968752417656, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 20656 + }, + { + "epoch": 0.20657, + "grad_norm": 1.0931001468798138, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 20657 + }, + { + "epoch": 0.20658, + "grad_norm": 0.8447870136672336, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 20658 + }, + { + "epoch": 0.20659, + "grad_norm": 0.741803508048347, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 20659 + }, + { + "epoch": 0.2066, + "grad_norm": 0.7609061056370786, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 20660 + }, + { + "epoch": 0.20661, + "grad_norm": 0.8243425693126477, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 20661 + }, + { + "epoch": 0.20662, + "grad_norm": 0.8062762459485772, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 20662 + }, + { + "epoch": 0.20663, + "grad_norm": 0.9261261062417931, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 20663 + }, + { + "epoch": 0.20664, + "grad_norm": 1.2020025510664534, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 20664 + }, + { + "epoch": 0.20665, + "grad_norm": 0.9958478965450076, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 20665 + }, + { + "epoch": 0.20666, + "grad_norm": 0.9435912414102534, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 20666 + }, + { + "epoch": 0.20667, + "grad_norm": 0.7709152620398154, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 20667 + }, + { + "epoch": 0.20668, + "grad_norm": 0.6865848655986527, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 20668 + }, + { + "epoch": 0.20669, + "grad_norm": 0.7534612856236368, + "learning_rate": 0.003, + "loss": 4.016, + "step": 20669 + }, + { + "epoch": 0.2067, + "grad_norm": 0.7970881081505984, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 20670 + }, + { + "epoch": 0.20671, + "grad_norm": 0.9914107819593648, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 20671 + }, + { + "epoch": 0.20672, + "grad_norm": 1.4812493275344005, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 20672 + }, + { + "epoch": 0.20673, + "grad_norm": 0.7589968412493799, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 20673 + }, + { + "epoch": 0.20674, + "grad_norm": 0.7745651088422716, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 20674 + }, + { + "epoch": 0.20675, + "grad_norm": 1.045882241928847, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 20675 + }, + { + "epoch": 0.20676, + "grad_norm": 1.2303870064394389, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 20676 + }, + { + "epoch": 0.20677, + "grad_norm": 0.7953019631338488, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 20677 + }, + { + "epoch": 0.20678, + "grad_norm": 0.738991276268295, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 20678 + }, + { + "epoch": 0.20679, + "grad_norm": 0.845274045959646, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 20679 + }, + { + "epoch": 0.2068, + "grad_norm": 0.8769337009639483, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 20680 + }, + { + "epoch": 0.20681, + "grad_norm": 0.9376339161457683, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 20681 + }, + { + "epoch": 0.20682, + "grad_norm": 0.911274612007686, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 20682 + }, + { + "epoch": 0.20683, + "grad_norm": 0.9139237733288875, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 20683 + }, + { + "epoch": 0.20684, + "grad_norm": 0.9076066287361105, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 20684 + }, + { + "epoch": 0.20685, + "grad_norm": 1.1126390912737316, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 20685 + }, + { + "epoch": 0.20686, + "grad_norm": 1.020779907172978, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 20686 + }, + { + "epoch": 0.20687, + "grad_norm": 0.8989671962564679, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 20687 + }, + { + "epoch": 0.20688, + "grad_norm": 0.9025550399033526, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 20688 + }, + { + "epoch": 0.20689, + "grad_norm": 0.9019164380921044, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 20689 + }, + { + "epoch": 0.2069, + "grad_norm": 0.953989204410349, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 20690 + }, + { + "epoch": 0.20691, + "grad_norm": 0.9263957419699532, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 20691 + }, + { + "epoch": 0.20692, + "grad_norm": 1.024185703940695, + "learning_rate": 0.003, + "loss": 4.08, + "step": 20692 + }, + { + "epoch": 0.20693, + "grad_norm": 1.1113324609358632, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 20693 + }, + { + "epoch": 0.20694, + "grad_norm": 1.0667487243293525, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 20694 + }, + { + "epoch": 0.20695, + "grad_norm": 1.0667691079896835, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 20695 + }, + { + "epoch": 0.20696, + "grad_norm": 0.9645752396874048, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 20696 + }, + { + "epoch": 0.20697, + "grad_norm": 0.8993716925969667, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 20697 + }, + { + "epoch": 0.20698, + "grad_norm": 0.7341038316597985, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 20698 + }, + { + "epoch": 0.20699, + "grad_norm": 0.7079950792544272, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 20699 + }, + { + "epoch": 0.207, + "grad_norm": 0.6393011363260942, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 20700 + }, + { + "epoch": 0.20701, + "grad_norm": 0.6529282234544391, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 20701 + }, + { + "epoch": 0.20702, + "grad_norm": 0.7591582713524963, + "learning_rate": 0.003, + "loss": 4.071, + "step": 20702 + }, + { + "epoch": 0.20703, + "grad_norm": 0.8399027560375545, + "learning_rate": 0.003, + "loss": 4.039, + "step": 20703 + }, + { + "epoch": 0.20704, + "grad_norm": 1.0229046265192852, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 20704 + }, + { + "epoch": 0.20705, + "grad_norm": 1.2528591896392958, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 20705 + }, + { + "epoch": 0.20706, + "grad_norm": 0.7339260783515749, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 20706 + }, + { + "epoch": 0.20707, + "grad_norm": 0.6933835542923777, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 20707 + }, + { + "epoch": 0.20708, + "grad_norm": 0.7133414719199157, + "learning_rate": 0.003, + "loss": 4.066, + "step": 20708 + }, + { + "epoch": 0.20709, + "grad_norm": 0.6871709880104687, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 20709 + }, + { + "epoch": 0.2071, + "grad_norm": 0.7342594176801578, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 20710 + }, + { + "epoch": 0.20711, + "grad_norm": 0.809280070894089, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 20711 + }, + { + "epoch": 0.20712, + "grad_norm": 0.9073706527628451, + "learning_rate": 0.003, + "loss": 4.071, + "step": 20712 + }, + { + "epoch": 0.20713, + "grad_norm": 0.843087423365037, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 20713 + }, + { + "epoch": 0.20714, + "grad_norm": 0.9121406100229121, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 20714 + }, + { + "epoch": 0.20715, + "grad_norm": 1.1185977770452542, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 20715 + }, + { + "epoch": 0.20716, + "grad_norm": 0.9665679098861336, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 20716 + }, + { + "epoch": 0.20717, + "grad_norm": 1.0138902289609497, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 20717 + }, + { + "epoch": 0.20718, + "grad_norm": 1.0341541210214493, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 20718 + }, + { + "epoch": 0.20719, + "grad_norm": 0.909857907822015, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 20719 + }, + { + "epoch": 0.2072, + "grad_norm": 0.883878385228263, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 20720 + }, + { + "epoch": 0.20721, + "grad_norm": 0.9663190029626658, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 20721 + }, + { + "epoch": 0.20722, + "grad_norm": 0.9569429956011049, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 20722 + }, + { + "epoch": 0.20723, + "grad_norm": 0.7961455675081555, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 20723 + }, + { + "epoch": 0.20724, + "grad_norm": 0.7822988601781593, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 20724 + }, + { + "epoch": 0.20725, + "grad_norm": 1.0356323599931876, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 20725 + }, + { + "epoch": 0.20726, + "grad_norm": 1.0625122756479055, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 20726 + }, + { + "epoch": 0.20727, + "grad_norm": 0.9313918687891721, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 20727 + }, + { + "epoch": 0.20728, + "grad_norm": 1.0000886729416245, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 20728 + }, + { + "epoch": 0.20729, + "grad_norm": 1.0193101696864726, + "learning_rate": 0.003, + "loss": 4.044, + "step": 20729 + }, + { + "epoch": 0.2073, + "grad_norm": 1.0133609674621558, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 20730 + }, + { + "epoch": 0.20731, + "grad_norm": 0.9585117508858823, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 20731 + }, + { + "epoch": 0.20732, + "grad_norm": 0.940756485622367, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 20732 + }, + { + "epoch": 0.20733, + "grad_norm": 0.9786370663315664, + "learning_rate": 0.003, + "loss": 4.044, + "step": 20733 + }, + { + "epoch": 0.20734, + "grad_norm": 0.8616271325318224, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 20734 + }, + { + "epoch": 0.20735, + "grad_norm": 0.7615164854628359, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 20735 + }, + { + "epoch": 0.20736, + "grad_norm": 0.7941263273306256, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 20736 + }, + { + "epoch": 0.20737, + "grad_norm": 0.9969172132139513, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 20737 + }, + { + "epoch": 0.20738, + "grad_norm": 1.5023736698642578, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 20738 + }, + { + "epoch": 0.20739, + "grad_norm": 0.9762028808361444, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 20739 + }, + { + "epoch": 0.2074, + "grad_norm": 1.1390590029131378, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 20740 + }, + { + "epoch": 0.20741, + "grad_norm": 1.0108577589696488, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 20741 + }, + { + "epoch": 0.20742, + "grad_norm": 0.8889441424104167, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 20742 + }, + { + "epoch": 0.20743, + "grad_norm": 0.9820342624142727, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 20743 + }, + { + "epoch": 0.20744, + "grad_norm": 1.2376835630873302, + "learning_rate": 0.003, + "loss": 4.1249, + "step": 20744 + }, + { + "epoch": 0.20745, + "grad_norm": 0.9350354190161171, + "learning_rate": 0.003, + "loss": 4.065, + "step": 20745 + }, + { + "epoch": 0.20746, + "grad_norm": 0.8964962553317527, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 20746 + }, + { + "epoch": 0.20747, + "grad_norm": 0.8783792688798422, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 20747 + }, + { + "epoch": 0.20748, + "grad_norm": 0.9841656727166727, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 20748 + }, + { + "epoch": 0.20749, + "grad_norm": 1.1104386965056001, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 20749 + }, + { + "epoch": 0.2075, + "grad_norm": 1.094278241959975, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 20750 + }, + { + "epoch": 0.20751, + "grad_norm": 1.1708732060101759, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 20751 + }, + { + "epoch": 0.20752, + "grad_norm": 0.9256835316651215, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 20752 + }, + { + "epoch": 0.20753, + "grad_norm": 0.8984418312953798, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 20753 + }, + { + "epoch": 0.20754, + "grad_norm": 0.9859929187322075, + "learning_rate": 0.003, + "loss": 4.085, + "step": 20754 + }, + { + "epoch": 0.20755, + "grad_norm": 1.0246863446222312, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 20755 + }, + { + "epoch": 0.20756, + "grad_norm": 0.9704673786656147, + "learning_rate": 0.003, + "loss": 4.076, + "step": 20756 + }, + { + "epoch": 0.20757, + "grad_norm": 1.060174963756663, + "learning_rate": 0.003, + "loss": 4.085, + "step": 20757 + }, + { + "epoch": 0.20758, + "grad_norm": 1.1987580364879207, + "learning_rate": 0.003, + "loss": 4.083, + "step": 20758 + }, + { + "epoch": 0.20759, + "grad_norm": 1.008741287140148, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 20759 + }, + { + "epoch": 0.2076, + "grad_norm": 0.9424250600937093, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 20760 + }, + { + "epoch": 0.20761, + "grad_norm": 1.0289292288640246, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 20761 + }, + { + "epoch": 0.20762, + "grad_norm": 0.9522234811529363, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 20762 + }, + { + "epoch": 0.20763, + "grad_norm": 1.0821446055005235, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 20763 + }, + { + "epoch": 0.20764, + "grad_norm": 1.2069188426787005, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 20764 + }, + { + "epoch": 0.20765, + "grad_norm": 1.0002832818009721, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 20765 + }, + { + "epoch": 0.20766, + "grad_norm": 1.0027780458531717, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 20766 + }, + { + "epoch": 0.20767, + "grad_norm": 1.0986168857699, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 20767 + }, + { + "epoch": 0.20768, + "grad_norm": 0.9867147085182186, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 20768 + }, + { + "epoch": 0.20769, + "grad_norm": 0.9468765183760098, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 20769 + }, + { + "epoch": 0.2077, + "grad_norm": 0.8015486553042366, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 20770 + }, + { + "epoch": 0.20771, + "grad_norm": 0.8833278320256701, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 20771 + }, + { + "epoch": 0.20772, + "grad_norm": 0.9759831595758212, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 20772 + }, + { + "epoch": 0.20773, + "grad_norm": 0.9466826472540202, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 20773 + }, + { + "epoch": 0.20774, + "grad_norm": 0.9070859077420977, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 20774 + }, + { + "epoch": 0.20775, + "grad_norm": 0.9272357848830094, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 20775 + }, + { + "epoch": 0.20776, + "grad_norm": 0.8951049696125486, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 20776 + }, + { + "epoch": 0.20777, + "grad_norm": 0.8373072634772573, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 20777 + }, + { + "epoch": 0.20778, + "grad_norm": 0.8185398936510428, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 20778 + }, + { + "epoch": 0.20779, + "grad_norm": 0.8774021381997962, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 20779 + }, + { + "epoch": 0.2078, + "grad_norm": 0.9529960509548837, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 20780 + }, + { + "epoch": 0.20781, + "grad_norm": 0.8872476092655752, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 20781 + }, + { + "epoch": 0.20782, + "grad_norm": 0.7204830834769962, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 20782 + }, + { + "epoch": 0.20783, + "grad_norm": 0.6801215653087698, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 20783 + }, + { + "epoch": 0.20784, + "grad_norm": 0.7711327109396316, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 20784 + }, + { + "epoch": 0.20785, + "grad_norm": 0.961085338661194, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 20785 + }, + { + "epoch": 0.20786, + "grad_norm": 1.2514225253609057, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 20786 + }, + { + "epoch": 0.20787, + "grad_norm": 0.7618441745477507, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 20787 + }, + { + "epoch": 0.20788, + "grad_norm": 0.7210302510964424, + "learning_rate": 0.003, + "loss": 4.03, + "step": 20788 + }, + { + "epoch": 0.20789, + "grad_norm": 0.8015987605300772, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 20789 + }, + { + "epoch": 0.2079, + "grad_norm": 0.8358797712113142, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 20790 + }, + { + "epoch": 0.20791, + "grad_norm": 0.7711390563496814, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 20791 + }, + { + "epoch": 0.20792, + "grad_norm": 0.7373600734332294, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 20792 + }, + { + "epoch": 0.20793, + "grad_norm": 0.8648692180455652, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 20793 + }, + { + "epoch": 0.20794, + "grad_norm": 1.1045574010554873, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 20794 + }, + { + "epoch": 0.20795, + "grad_norm": 0.8706162549835064, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 20795 + }, + { + "epoch": 0.20796, + "grad_norm": 0.7991366020970061, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 20796 + }, + { + "epoch": 0.20797, + "grad_norm": 0.8266699540161978, + "learning_rate": 0.003, + "loss": 4.0065, + "step": 20797 + }, + { + "epoch": 0.20798, + "grad_norm": 0.9295114312035573, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 20798 + }, + { + "epoch": 0.20799, + "grad_norm": 0.9722309746197256, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 20799 + }, + { + "epoch": 0.208, + "grad_norm": 1.0150228422407528, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 20800 + }, + { + "epoch": 0.20801, + "grad_norm": 0.9434807998329356, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 20801 + }, + { + "epoch": 0.20802, + "grad_norm": 0.8426632504188805, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 20802 + }, + { + "epoch": 0.20803, + "grad_norm": 0.7962692959280201, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 20803 + }, + { + "epoch": 0.20804, + "grad_norm": 0.8521264590982106, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 20804 + }, + { + "epoch": 0.20805, + "grad_norm": 0.9657034625579123, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 20805 + }, + { + "epoch": 0.20806, + "grad_norm": 1.1519085946825933, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 20806 + }, + { + "epoch": 0.20807, + "grad_norm": 0.8500282300755746, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 20807 + }, + { + "epoch": 0.20808, + "grad_norm": 0.8156829967817358, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 20808 + }, + { + "epoch": 0.20809, + "grad_norm": 0.7827086334718794, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 20809 + }, + { + "epoch": 0.2081, + "grad_norm": 0.6837126100099973, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 20810 + }, + { + "epoch": 0.20811, + "grad_norm": 0.6619164480276214, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 20811 + }, + { + "epoch": 0.20812, + "grad_norm": 0.6699766058255511, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 20812 + }, + { + "epoch": 0.20813, + "grad_norm": 0.8790057035161044, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 20813 + }, + { + "epoch": 0.20814, + "grad_norm": 1.0609515662283509, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 20814 + }, + { + "epoch": 0.20815, + "grad_norm": 0.9883373278839427, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 20815 + }, + { + "epoch": 0.20816, + "grad_norm": 1.0900471224543256, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 20816 + }, + { + "epoch": 0.20817, + "grad_norm": 0.8779952327281877, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 20817 + }, + { + "epoch": 0.20818, + "grad_norm": 0.7728878959603992, + "learning_rate": 0.003, + "loss": 4.086, + "step": 20818 + }, + { + "epoch": 0.20819, + "grad_norm": 0.7201337922741882, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 20819 + }, + { + "epoch": 0.2082, + "grad_norm": 0.7395626855942592, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 20820 + }, + { + "epoch": 0.20821, + "grad_norm": 0.7683142880475228, + "learning_rate": 0.003, + "loss": 4.032, + "step": 20821 + }, + { + "epoch": 0.20822, + "grad_norm": 0.8218967880588436, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 20822 + }, + { + "epoch": 0.20823, + "grad_norm": 0.834345833872554, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 20823 + }, + { + "epoch": 0.20824, + "grad_norm": 0.7987532170222379, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 20824 + }, + { + "epoch": 0.20825, + "grad_norm": 0.8198056489592254, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 20825 + }, + { + "epoch": 0.20826, + "grad_norm": 0.885014579488387, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 20826 + }, + { + "epoch": 0.20827, + "grad_norm": 0.9228575617691458, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 20827 + }, + { + "epoch": 0.20828, + "grad_norm": 0.8477871640540304, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 20828 + }, + { + "epoch": 0.20829, + "grad_norm": 0.9524373852750858, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 20829 + }, + { + "epoch": 0.2083, + "grad_norm": 1.1650023972190342, + "learning_rate": 0.003, + "loss": 4.044, + "step": 20830 + }, + { + "epoch": 0.20831, + "grad_norm": 0.8958374467271202, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 20831 + }, + { + "epoch": 0.20832, + "grad_norm": 0.7579417270438731, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 20832 + }, + { + "epoch": 0.20833, + "grad_norm": 0.6654744014024893, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 20833 + }, + { + "epoch": 0.20834, + "grad_norm": 0.7169839141950568, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 20834 + }, + { + "epoch": 0.20835, + "grad_norm": 0.9159520675068701, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 20835 + }, + { + "epoch": 0.20836, + "grad_norm": 0.8988485140413005, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 20836 + }, + { + "epoch": 0.20837, + "grad_norm": 0.8700377997356233, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 20837 + }, + { + "epoch": 0.20838, + "grad_norm": 1.0247823704201304, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 20838 + }, + { + "epoch": 0.20839, + "grad_norm": 1.2176239640475501, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 20839 + }, + { + "epoch": 0.2084, + "grad_norm": 0.8474460656441073, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 20840 + }, + { + "epoch": 0.20841, + "grad_norm": 0.7247828273171255, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 20841 + }, + { + "epoch": 0.20842, + "grad_norm": 0.6973333704945514, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 20842 + }, + { + "epoch": 0.20843, + "grad_norm": 0.6930687854800611, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 20843 + }, + { + "epoch": 0.20844, + "grad_norm": 0.7565297191579194, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 20844 + }, + { + "epoch": 0.20845, + "grad_norm": 0.8720648821399303, + "learning_rate": 0.003, + "loss": 4.056, + "step": 20845 + }, + { + "epoch": 0.20846, + "grad_norm": 1.1494438524283832, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 20846 + }, + { + "epoch": 0.20847, + "grad_norm": 1.0470518786789225, + "learning_rate": 0.003, + "loss": 4.085, + "step": 20847 + }, + { + "epoch": 0.20848, + "grad_norm": 1.0835417349275442, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 20848 + }, + { + "epoch": 0.20849, + "grad_norm": 0.940728512493772, + "learning_rate": 0.003, + "loss": 4.052, + "step": 20849 + }, + { + "epoch": 0.2085, + "grad_norm": 0.7481177236998657, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 20850 + }, + { + "epoch": 0.20851, + "grad_norm": 0.6206226004147564, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 20851 + }, + { + "epoch": 0.20852, + "grad_norm": 0.5888691845170154, + "learning_rate": 0.003, + "loss": 4.052, + "step": 20852 + }, + { + "epoch": 0.20853, + "grad_norm": 0.6709893240010038, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 20853 + }, + { + "epoch": 0.20854, + "grad_norm": 0.9154878899593396, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 20854 + }, + { + "epoch": 0.20855, + "grad_norm": 1.348181078472298, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 20855 + }, + { + "epoch": 0.20856, + "grad_norm": 0.533570289649261, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 20856 + }, + { + "epoch": 0.20857, + "grad_norm": 0.8123699800950966, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 20857 + }, + { + "epoch": 0.20858, + "grad_norm": 1.2357741301996246, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 20858 + }, + { + "epoch": 0.20859, + "grad_norm": 0.5919763432136748, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 20859 + }, + { + "epoch": 0.2086, + "grad_norm": 0.6796295592904262, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 20860 + }, + { + "epoch": 0.20861, + "grad_norm": 0.8091425758358133, + "learning_rate": 0.003, + "loss": 4.044, + "step": 20861 + }, + { + "epoch": 0.20862, + "grad_norm": 0.8879189719290468, + "learning_rate": 0.003, + "loss": 4.04, + "step": 20862 + }, + { + "epoch": 0.20863, + "grad_norm": 0.9195269575657935, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 20863 + }, + { + "epoch": 0.20864, + "grad_norm": 0.9066008738921145, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 20864 + }, + { + "epoch": 0.20865, + "grad_norm": 0.8895437059425502, + "learning_rate": 0.003, + "loss": 4.057, + "step": 20865 + }, + { + "epoch": 0.20866, + "grad_norm": 0.9063383514726553, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 20866 + }, + { + "epoch": 0.20867, + "grad_norm": 1.0377430733789423, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 20867 + }, + { + "epoch": 0.20868, + "grad_norm": 0.8914793363756183, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 20868 + }, + { + "epoch": 0.20869, + "grad_norm": 0.8682923363415345, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 20869 + }, + { + "epoch": 0.2087, + "grad_norm": 0.9380749540622735, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 20870 + }, + { + "epoch": 0.20871, + "grad_norm": 0.8936449518543564, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 20871 + }, + { + "epoch": 0.20872, + "grad_norm": 0.9068738242848167, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 20872 + }, + { + "epoch": 0.20873, + "grad_norm": 0.9227432132140321, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 20873 + }, + { + "epoch": 0.20874, + "grad_norm": 0.9128861571901247, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 20874 + }, + { + "epoch": 0.20875, + "grad_norm": 0.8761302133314882, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 20875 + }, + { + "epoch": 0.20876, + "grad_norm": 0.8366480631194192, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 20876 + }, + { + "epoch": 0.20877, + "grad_norm": 0.855363742188142, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 20877 + }, + { + "epoch": 0.20878, + "grad_norm": 0.7831765518582599, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 20878 + }, + { + "epoch": 0.20879, + "grad_norm": 0.8521269894108455, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 20879 + }, + { + "epoch": 0.2088, + "grad_norm": 0.9035866348006848, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 20880 + }, + { + "epoch": 0.20881, + "grad_norm": 0.9265370826142766, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 20881 + }, + { + "epoch": 0.20882, + "grad_norm": 1.0037908592074307, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 20882 + }, + { + "epoch": 0.20883, + "grad_norm": 0.968690414016621, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 20883 + }, + { + "epoch": 0.20884, + "grad_norm": 1.1660468868596874, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 20884 + }, + { + "epoch": 0.20885, + "grad_norm": 1.058396478086012, + "learning_rate": 0.003, + "loss": 4.066, + "step": 20885 + }, + { + "epoch": 0.20886, + "grad_norm": 0.9555840750202294, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 20886 + }, + { + "epoch": 0.20887, + "grad_norm": 1.1188187594505572, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 20887 + }, + { + "epoch": 0.20888, + "grad_norm": 0.9444405945730495, + "learning_rate": 0.003, + "loss": 4.086, + "step": 20888 + }, + { + "epoch": 0.20889, + "grad_norm": 0.9386917222804859, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 20889 + }, + { + "epoch": 0.2089, + "grad_norm": 0.8123897004406706, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 20890 + }, + { + "epoch": 0.20891, + "grad_norm": 0.7839225369108362, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 20891 + }, + { + "epoch": 0.20892, + "grad_norm": 0.8935177672635446, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 20892 + }, + { + "epoch": 0.20893, + "grad_norm": 0.9836856323206956, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 20893 + }, + { + "epoch": 0.20894, + "grad_norm": 1.1754153858999739, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 20894 + }, + { + "epoch": 0.20895, + "grad_norm": 0.7068288140116837, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 20895 + }, + { + "epoch": 0.20896, + "grad_norm": 0.6449442557927029, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 20896 + }, + { + "epoch": 0.20897, + "grad_norm": 0.6398501563580534, + "learning_rate": 0.003, + "loss": 4.045, + "step": 20897 + }, + { + "epoch": 0.20898, + "grad_norm": 0.5404930010042057, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 20898 + }, + { + "epoch": 0.20899, + "grad_norm": 0.5655871736814144, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 20899 + }, + { + "epoch": 0.209, + "grad_norm": 0.6020237435608026, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 20900 + }, + { + "epoch": 0.20901, + "grad_norm": 0.7601498429681562, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 20901 + }, + { + "epoch": 0.20902, + "grad_norm": 0.9586553021196883, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 20902 + }, + { + "epoch": 0.20903, + "grad_norm": 1.1292970133758855, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 20903 + }, + { + "epoch": 0.20904, + "grad_norm": 0.9060190545624713, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 20904 + }, + { + "epoch": 0.20905, + "grad_norm": 0.8036263350051671, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 20905 + }, + { + "epoch": 0.20906, + "grad_norm": 0.8514165614844775, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 20906 + }, + { + "epoch": 0.20907, + "grad_norm": 0.8891648021569436, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 20907 + }, + { + "epoch": 0.20908, + "grad_norm": 0.9041538277723808, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 20908 + }, + { + "epoch": 0.20909, + "grad_norm": 0.9657488087980987, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 20909 + }, + { + "epoch": 0.2091, + "grad_norm": 1.0294279686468277, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 20910 + }, + { + "epoch": 0.20911, + "grad_norm": 1.1272615296740602, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 20911 + }, + { + "epoch": 0.20912, + "grad_norm": 0.96115295435553, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 20912 + }, + { + "epoch": 0.20913, + "grad_norm": 0.867637661574733, + "learning_rate": 0.003, + "loss": 4.069, + "step": 20913 + }, + { + "epoch": 0.20914, + "grad_norm": 0.8872088685369496, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 20914 + }, + { + "epoch": 0.20915, + "grad_norm": 0.8862017167689719, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 20915 + }, + { + "epoch": 0.20916, + "grad_norm": 0.764827789701457, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 20916 + }, + { + "epoch": 0.20917, + "grad_norm": 0.6561755595633655, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 20917 + }, + { + "epoch": 0.20918, + "grad_norm": 0.6362220571951497, + "learning_rate": 0.003, + "loss": 4.051, + "step": 20918 + }, + { + "epoch": 0.20919, + "grad_norm": 0.760983641615835, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 20919 + }, + { + "epoch": 0.2092, + "grad_norm": 0.9662819849835595, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 20920 + }, + { + "epoch": 0.20921, + "grad_norm": 0.9497005033603845, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 20921 + }, + { + "epoch": 0.20922, + "grad_norm": 1.0256543509228493, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 20922 + }, + { + "epoch": 0.20923, + "grad_norm": 1.0288801321590189, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 20923 + }, + { + "epoch": 0.20924, + "grad_norm": 0.9575742206763543, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 20924 + }, + { + "epoch": 0.20925, + "grad_norm": 0.9518517727333672, + "learning_rate": 0.003, + "loss": 4.08, + "step": 20925 + }, + { + "epoch": 0.20926, + "grad_norm": 1.0101356656910154, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 20926 + }, + { + "epoch": 0.20927, + "grad_norm": 0.9208202329357542, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 20927 + }, + { + "epoch": 0.20928, + "grad_norm": 0.9321816400545891, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 20928 + }, + { + "epoch": 0.20929, + "grad_norm": 1.0111036566490585, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 20929 + }, + { + "epoch": 0.2093, + "grad_norm": 0.9735364164661735, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 20930 + }, + { + "epoch": 0.20931, + "grad_norm": 0.9459289793368022, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 20931 + }, + { + "epoch": 0.20932, + "grad_norm": 1.0179234302052216, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 20932 + }, + { + "epoch": 0.20933, + "grad_norm": 0.9970085209525912, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 20933 + }, + { + "epoch": 0.20934, + "grad_norm": 1.0353416538699811, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 20934 + }, + { + "epoch": 0.20935, + "grad_norm": 0.9682634285806023, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 20935 + }, + { + "epoch": 0.20936, + "grad_norm": 1.0529142887227032, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 20936 + }, + { + "epoch": 0.20937, + "grad_norm": 0.7584596088651663, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 20937 + }, + { + "epoch": 0.20938, + "grad_norm": 0.8028796370689205, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 20938 + }, + { + "epoch": 0.20939, + "grad_norm": 0.7215962925971722, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 20939 + }, + { + "epoch": 0.2094, + "grad_norm": 0.7726815801453961, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 20940 + }, + { + "epoch": 0.20941, + "grad_norm": 0.8234341681451163, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 20941 + }, + { + "epoch": 0.20942, + "grad_norm": 0.9294759080755696, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 20942 + }, + { + "epoch": 0.20943, + "grad_norm": 1.0540080989455443, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 20943 + }, + { + "epoch": 0.20944, + "grad_norm": 1.0771125126478065, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 20944 + }, + { + "epoch": 0.20945, + "grad_norm": 0.9606589362088807, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 20945 + }, + { + "epoch": 0.20946, + "grad_norm": 0.9138274590289285, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 20946 + }, + { + "epoch": 0.20947, + "grad_norm": 0.8336271922049947, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 20947 + }, + { + "epoch": 0.20948, + "grad_norm": 0.7548154530579022, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 20948 + }, + { + "epoch": 0.20949, + "grad_norm": 0.6556948254700824, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 20949 + }, + { + "epoch": 0.2095, + "grad_norm": 0.7527190926661659, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 20950 + }, + { + "epoch": 0.20951, + "grad_norm": 0.7849746003955136, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 20951 + }, + { + "epoch": 0.20952, + "grad_norm": 0.8720462836880155, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 20952 + }, + { + "epoch": 0.20953, + "grad_norm": 1.012691571410559, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 20953 + }, + { + "epoch": 0.20954, + "grad_norm": 0.9642645746189286, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 20954 + }, + { + "epoch": 0.20955, + "grad_norm": 0.906695269261336, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 20955 + }, + { + "epoch": 0.20956, + "grad_norm": 0.8938285305084355, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 20956 + }, + { + "epoch": 0.20957, + "grad_norm": 0.8915711865436335, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 20957 + }, + { + "epoch": 0.20958, + "grad_norm": 0.87353229798638, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 20958 + }, + { + "epoch": 0.20959, + "grad_norm": 0.9904221888857875, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 20959 + }, + { + "epoch": 0.2096, + "grad_norm": 1.2355844583948616, + "learning_rate": 0.003, + "loss": 4.059, + "step": 20960 + }, + { + "epoch": 0.20961, + "grad_norm": 0.9292678513392834, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 20961 + }, + { + "epoch": 0.20962, + "grad_norm": 0.899412887479403, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 20962 + }, + { + "epoch": 0.20963, + "grad_norm": 0.8005233489070702, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 20963 + }, + { + "epoch": 0.20964, + "grad_norm": 0.8393734088076394, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 20964 + }, + { + "epoch": 0.20965, + "grad_norm": 0.9801653239512133, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 20965 + }, + { + "epoch": 0.20966, + "grad_norm": 1.1035705657174708, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 20966 + }, + { + "epoch": 0.20967, + "grad_norm": 0.8951714877176081, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 20967 + }, + { + "epoch": 0.20968, + "grad_norm": 0.9984300213184893, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 20968 + }, + { + "epoch": 0.20969, + "grad_norm": 1.1447322626526333, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 20969 + }, + { + "epoch": 0.2097, + "grad_norm": 0.981719037133992, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 20970 + }, + { + "epoch": 0.20971, + "grad_norm": 0.8274862463046855, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 20971 + }, + { + "epoch": 0.20972, + "grad_norm": 0.6057424137656625, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 20972 + }, + { + "epoch": 0.20973, + "grad_norm": 0.6194221064418588, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 20973 + }, + { + "epoch": 0.20974, + "grad_norm": 0.7136364923130752, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 20974 + }, + { + "epoch": 0.20975, + "grad_norm": 0.7920099308835159, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 20975 + }, + { + "epoch": 0.20976, + "grad_norm": 0.8529938394212799, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 20976 + }, + { + "epoch": 0.20977, + "grad_norm": 0.7476903419204045, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 20977 + }, + { + "epoch": 0.20978, + "grad_norm": 0.7445633455785335, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 20978 + }, + { + "epoch": 0.20979, + "grad_norm": 0.8800103611750361, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 20979 + }, + { + "epoch": 0.2098, + "grad_norm": 0.7821239446012019, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 20980 + }, + { + "epoch": 0.20981, + "grad_norm": 0.7517569464820335, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 20981 + }, + { + "epoch": 0.20982, + "grad_norm": 0.9657130463328368, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 20982 + }, + { + "epoch": 0.20983, + "grad_norm": 1.0867425041569467, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 20983 + }, + { + "epoch": 0.20984, + "grad_norm": 1.0630272947915531, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 20984 + }, + { + "epoch": 0.20985, + "grad_norm": 1.0346921683585952, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 20985 + }, + { + "epoch": 0.20986, + "grad_norm": 1.0325122421335289, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 20986 + }, + { + "epoch": 0.20987, + "grad_norm": 1.033786469848335, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 20987 + }, + { + "epoch": 0.20988, + "grad_norm": 0.940109667780733, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 20988 + }, + { + "epoch": 0.20989, + "grad_norm": 1.0250377702460591, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 20989 + }, + { + "epoch": 0.2099, + "grad_norm": 0.9870851149142993, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 20990 + }, + { + "epoch": 0.20991, + "grad_norm": 1.0900184477933434, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 20991 + }, + { + "epoch": 0.20992, + "grad_norm": 0.9224192173796866, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 20992 + }, + { + "epoch": 0.20993, + "grad_norm": 0.7347907145935488, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 20993 + }, + { + "epoch": 0.20994, + "grad_norm": 0.6494314159332513, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 20994 + }, + { + "epoch": 0.20995, + "grad_norm": 0.6223911937898218, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 20995 + }, + { + "epoch": 0.20996, + "grad_norm": 0.5943403133762641, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 20996 + }, + { + "epoch": 0.20997, + "grad_norm": 0.6413854579845952, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 20997 + }, + { + "epoch": 0.20998, + "grad_norm": 0.7006561359930732, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 20998 + }, + { + "epoch": 0.20999, + "grad_norm": 0.682415504085104, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 20999 + }, + { + "epoch": 0.21, + "grad_norm": 0.6998869133750484, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 21000 + }, + { + "epoch": 0.21001, + "grad_norm": 0.6435399026341386, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 21001 + }, + { + "epoch": 0.21002, + "grad_norm": 0.6273031068741116, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 21002 + }, + { + "epoch": 0.21003, + "grad_norm": 0.673517232813251, + "learning_rate": 0.003, + "loss": 4.042, + "step": 21003 + }, + { + "epoch": 0.21004, + "grad_norm": 0.7430702012339945, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 21004 + }, + { + "epoch": 0.21005, + "grad_norm": 0.9560227725435448, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 21005 + }, + { + "epoch": 0.21006, + "grad_norm": 1.2456053948463115, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 21006 + }, + { + "epoch": 0.21007, + "grad_norm": 1.0358346003790344, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 21007 + }, + { + "epoch": 0.21008, + "grad_norm": 0.9437335127085855, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 21008 + }, + { + "epoch": 0.21009, + "grad_norm": 0.9332984345554216, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 21009 + }, + { + "epoch": 0.2101, + "grad_norm": 0.9413725326339256, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 21010 + }, + { + "epoch": 0.21011, + "grad_norm": 1.1666496645699866, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 21011 + }, + { + "epoch": 0.21012, + "grad_norm": 1.0288992373381118, + "learning_rate": 0.003, + "loss": 4.076, + "step": 21012 + }, + { + "epoch": 0.21013, + "grad_norm": 0.8636091001155501, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 21013 + }, + { + "epoch": 0.21014, + "grad_norm": 0.8403243600059537, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 21014 + }, + { + "epoch": 0.21015, + "grad_norm": 0.8784863249597779, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 21015 + }, + { + "epoch": 0.21016, + "grad_norm": 0.8631408808007526, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 21016 + }, + { + "epoch": 0.21017, + "grad_norm": 0.8644516790702244, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 21017 + }, + { + "epoch": 0.21018, + "grad_norm": 0.9616570865165733, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 21018 + }, + { + "epoch": 0.21019, + "grad_norm": 0.9906264791766691, + "learning_rate": 0.003, + "loss": 4.046, + "step": 21019 + }, + { + "epoch": 0.2102, + "grad_norm": 1.1804328901765606, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 21020 + }, + { + "epoch": 0.21021, + "grad_norm": 1.0531486649930049, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 21021 + }, + { + "epoch": 0.21022, + "grad_norm": 1.084725270812928, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 21022 + }, + { + "epoch": 0.21023, + "grad_norm": 1.0324941560813041, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 21023 + }, + { + "epoch": 0.21024, + "grad_norm": 1.0161370054174468, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 21024 + }, + { + "epoch": 0.21025, + "grad_norm": 1.0137450360714135, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 21025 + }, + { + "epoch": 0.21026, + "grad_norm": 1.1833859414069834, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 21026 + }, + { + "epoch": 0.21027, + "grad_norm": 0.8923078275673527, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 21027 + }, + { + "epoch": 0.21028, + "grad_norm": 0.8130443215360488, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 21028 + }, + { + "epoch": 0.21029, + "grad_norm": 0.7670036528086152, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 21029 + }, + { + "epoch": 0.2103, + "grad_norm": 0.6912671653028333, + "learning_rate": 0.003, + "loss": 4.068, + "step": 21030 + }, + { + "epoch": 0.21031, + "grad_norm": 0.6086060166202992, + "learning_rate": 0.003, + "loss": 4.048, + "step": 21031 + }, + { + "epoch": 0.21032, + "grad_norm": 0.7017603978899594, + "learning_rate": 0.003, + "loss": 4.049, + "step": 21032 + }, + { + "epoch": 0.21033, + "grad_norm": 0.7622072450090098, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 21033 + }, + { + "epoch": 0.21034, + "grad_norm": 0.8642013213533124, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 21034 + }, + { + "epoch": 0.21035, + "grad_norm": 1.0029696165418491, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 21035 + }, + { + "epoch": 0.21036, + "grad_norm": 1.2489051096803996, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 21036 + }, + { + "epoch": 0.21037, + "grad_norm": 0.7941122063837073, + "learning_rate": 0.003, + "loss": 4.056, + "step": 21037 + }, + { + "epoch": 0.21038, + "grad_norm": 0.822774509536195, + "learning_rate": 0.003, + "loss": 4.072, + "step": 21038 + }, + { + "epoch": 0.21039, + "grad_norm": 0.9504645683506947, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 21039 + }, + { + "epoch": 0.2104, + "grad_norm": 1.2007252057555626, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 21040 + }, + { + "epoch": 0.21041, + "grad_norm": 0.8525491884398844, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 21041 + }, + { + "epoch": 0.21042, + "grad_norm": 0.7650256027036704, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 21042 + }, + { + "epoch": 0.21043, + "grad_norm": 0.7397310280269651, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 21043 + }, + { + "epoch": 0.21044, + "grad_norm": 0.7557234288434997, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 21044 + }, + { + "epoch": 0.21045, + "grad_norm": 0.8213142899806527, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 21045 + }, + { + "epoch": 0.21046, + "grad_norm": 0.7997509105781899, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 21046 + }, + { + "epoch": 0.21047, + "grad_norm": 0.7419456621563829, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 21047 + }, + { + "epoch": 0.21048, + "grad_norm": 0.8268305174669067, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 21048 + }, + { + "epoch": 0.21049, + "grad_norm": 1.0048050806880642, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 21049 + }, + { + "epoch": 0.2105, + "grad_norm": 1.0325392092503214, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 21050 + }, + { + "epoch": 0.21051, + "grad_norm": 0.8314267179215811, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 21051 + }, + { + "epoch": 0.21052, + "grad_norm": 0.786931387085538, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 21052 + }, + { + "epoch": 0.21053, + "grad_norm": 0.698446646816597, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 21053 + }, + { + "epoch": 0.21054, + "grad_norm": 0.7073659733355145, + "learning_rate": 0.003, + "loss": 4.071, + "step": 21054 + }, + { + "epoch": 0.21055, + "grad_norm": 0.7162249843379157, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 21055 + }, + { + "epoch": 0.21056, + "grad_norm": 0.7590214084202431, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 21056 + }, + { + "epoch": 0.21057, + "grad_norm": 0.7979945720370192, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 21057 + }, + { + "epoch": 0.21058, + "grad_norm": 0.8619043099997161, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 21058 + }, + { + "epoch": 0.21059, + "grad_norm": 1.101043783627659, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 21059 + }, + { + "epoch": 0.2106, + "grad_norm": 1.071976055417973, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 21060 + }, + { + "epoch": 0.21061, + "grad_norm": 0.9616464727901872, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 21061 + }, + { + "epoch": 0.21062, + "grad_norm": 1.016752772330011, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 21062 + }, + { + "epoch": 0.21063, + "grad_norm": 1.0098029454164281, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 21063 + }, + { + "epoch": 0.21064, + "grad_norm": 0.9269076398128128, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 21064 + }, + { + "epoch": 0.21065, + "grad_norm": 0.8422468539490227, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 21065 + }, + { + "epoch": 0.21066, + "grad_norm": 0.8183083063942778, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 21066 + }, + { + "epoch": 0.21067, + "grad_norm": 0.7730981172085367, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 21067 + }, + { + "epoch": 0.21068, + "grad_norm": 0.8351167347595602, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 21068 + }, + { + "epoch": 0.21069, + "grad_norm": 0.7623075012114217, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 21069 + }, + { + "epoch": 0.2107, + "grad_norm": 0.811489738558358, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 21070 + }, + { + "epoch": 0.21071, + "grad_norm": 0.7183944988996523, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 21071 + }, + { + "epoch": 0.21072, + "grad_norm": 0.7226136180078927, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 21072 + }, + { + "epoch": 0.21073, + "grad_norm": 0.9121160681331745, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 21073 + }, + { + "epoch": 0.21074, + "grad_norm": 1.1824481576173265, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 21074 + }, + { + "epoch": 0.21075, + "grad_norm": 1.0709399945555922, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 21075 + }, + { + "epoch": 0.21076, + "grad_norm": 1.1417092377978277, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 21076 + }, + { + "epoch": 0.21077, + "grad_norm": 0.9127570254238125, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 21077 + }, + { + "epoch": 0.21078, + "grad_norm": 0.7998226678057007, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 21078 + }, + { + "epoch": 0.21079, + "grad_norm": 0.735479029798332, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 21079 + }, + { + "epoch": 0.2108, + "grad_norm": 0.9112384260161761, + "learning_rate": 0.003, + "loss": 4.017, + "step": 21080 + }, + { + "epoch": 0.21081, + "grad_norm": 1.050632586819599, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 21081 + }, + { + "epoch": 0.21082, + "grad_norm": 1.0501659096074498, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 21082 + }, + { + "epoch": 0.21083, + "grad_norm": 1.014205008967147, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 21083 + }, + { + "epoch": 0.21084, + "grad_norm": 0.9561378141575905, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 21084 + }, + { + "epoch": 0.21085, + "grad_norm": 0.9482915838054728, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 21085 + }, + { + "epoch": 0.21086, + "grad_norm": 1.0836474842428978, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 21086 + }, + { + "epoch": 0.21087, + "grad_norm": 0.9506211186558229, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 21087 + }, + { + "epoch": 0.21088, + "grad_norm": 0.9675611930060828, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 21088 + }, + { + "epoch": 0.21089, + "grad_norm": 1.0861416219497653, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 21089 + }, + { + "epoch": 0.2109, + "grad_norm": 1.026866491003027, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 21090 + }, + { + "epoch": 0.21091, + "grad_norm": 1.1113761885995883, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 21091 + }, + { + "epoch": 0.21092, + "grad_norm": 0.782651591909814, + "learning_rate": 0.003, + "loss": 4.078, + "step": 21092 + }, + { + "epoch": 0.21093, + "grad_norm": 0.6976526250302678, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 21093 + }, + { + "epoch": 0.21094, + "grad_norm": 0.7023845356491255, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 21094 + }, + { + "epoch": 0.21095, + "grad_norm": 0.6796931345316704, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 21095 + }, + { + "epoch": 0.21096, + "grad_norm": 0.6974224560446238, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 21096 + }, + { + "epoch": 0.21097, + "grad_norm": 0.8763911602676883, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 21097 + }, + { + "epoch": 0.21098, + "grad_norm": 1.1301164496685343, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 21098 + }, + { + "epoch": 0.21099, + "grad_norm": 1.0560107088308537, + "learning_rate": 0.003, + "loss": 4.04, + "step": 21099 + }, + { + "epoch": 0.211, + "grad_norm": 0.914678192060454, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 21100 + }, + { + "epoch": 0.21101, + "grad_norm": 0.728814810016415, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 21101 + }, + { + "epoch": 0.21102, + "grad_norm": 0.7043960115867001, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 21102 + }, + { + "epoch": 0.21103, + "grad_norm": 0.731339414118106, + "learning_rate": 0.003, + "loss": 4.058, + "step": 21103 + }, + { + "epoch": 0.21104, + "grad_norm": 0.6503725794176464, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 21104 + }, + { + "epoch": 0.21105, + "grad_norm": 0.5908475393905657, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 21105 + }, + { + "epoch": 0.21106, + "grad_norm": 0.6944105738869012, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 21106 + }, + { + "epoch": 0.21107, + "grad_norm": 0.6728185090033827, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 21107 + }, + { + "epoch": 0.21108, + "grad_norm": 0.6387129313466028, + "learning_rate": 0.003, + "loss": 4.052, + "step": 21108 + }, + { + "epoch": 0.21109, + "grad_norm": 0.8395486307179711, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 21109 + }, + { + "epoch": 0.2111, + "grad_norm": 1.032022023581922, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 21110 + }, + { + "epoch": 0.21111, + "grad_norm": 1.2141059703309585, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 21111 + }, + { + "epoch": 0.21112, + "grad_norm": 1.003124856862386, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 21112 + }, + { + "epoch": 0.21113, + "grad_norm": 1.2313198953043571, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 21113 + }, + { + "epoch": 0.21114, + "grad_norm": 0.9446560106026629, + "learning_rate": 0.003, + "loss": 4.08, + "step": 21114 + }, + { + "epoch": 0.21115, + "grad_norm": 0.9690466848006403, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 21115 + }, + { + "epoch": 0.21116, + "grad_norm": 0.7765049882661763, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 21116 + }, + { + "epoch": 0.21117, + "grad_norm": 0.7674281129106878, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 21117 + }, + { + "epoch": 0.21118, + "grad_norm": 0.8050658312568353, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 21118 + }, + { + "epoch": 0.21119, + "grad_norm": 0.9527115024060966, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 21119 + }, + { + "epoch": 0.2112, + "grad_norm": 1.0664148839093905, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 21120 + }, + { + "epoch": 0.21121, + "grad_norm": 0.8989309264153537, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 21121 + }, + { + "epoch": 0.21122, + "grad_norm": 0.8553488932637456, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 21122 + }, + { + "epoch": 0.21123, + "grad_norm": 0.8530798206832544, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 21123 + }, + { + "epoch": 0.21124, + "grad_norm": 0.8607838825788022, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 21124 + }, + { + "epoch": 0.21125, + "grad_norm": 0.8716327668518643, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 21125 + }, + { + "epoch": 0.21126, + "grad_norm": 0.9891438273656992, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 21126 + }, + { + "epoch": 0.21127, + "grad_norm": 1.1312964391301088, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 21127 + }, + { + "epoch": 0.21128, + "grad_norm": 0.8552845659556313, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 21128 + }, + { + "epoch": 0.21129, + "grad_norm": 0.7829051331214251, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 21129 + }, + { + "epoch": 0.2113, + "grad_norm": 0.8032688655307006, + "learning_rate": 0.003, + "loss": 4.056, + "step": 21130 + }, + { + "epoch": 0.21131, + "grad_norm": 0.887240068118571, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 21131 + }, + { + "epoch": 0.21132, + "grad_norm": 1.0255753122391946, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 21132 + }, + { + "epoch": 0.21133, + "grad_norm": 0.9770052068304652, + "learning_rate": 0.003, + "loss": 4.027, + "step": 21133 + }, + { + "epoch": 0.21134, + "grad_norm": 1.0838121625918715, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 21134 + }, + { + "epoch": 0.21135, + "grad_norm": 1.0212083668102394, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 21135 + }, + { + "epoch": 0.21136, + "grad_norm": 0.9957979334382595, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 21136 + }, + { + "epoch": 0.21137, + "grad_norm": 0.9654371177807874, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 21137 + }, + { + "epoch": 0.21138, + "grad_norm": 0.9935918355132888, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 21138 + }, + { + "epoch": 0.21139, + "grad_norm": 0.9798029322391021, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 21139 + }, + { + "epoch": 0.2114, + "grad_norm": 1.0888142750803371, + "learning_rate": 0.003, + "loss": 4.079, + "step": 21140 + }, + { + "epoch": 0.21141, + "grad_norm": 0.8104995200408619, + "learning_rate": 0.003, + "loss": 4.05, + "step": 21141 + }, + { + "epoch": 0.21142, + "grad_norm": 0.7112571958199563, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 21142 + }, + { + "epoch": 0.21143, + "grad_norm": 0.7255826242780293, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 21143 + }, + { + "epoch": 0.21144, + "grad_norm": 0.793102625204424, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 21144 + }, + { + "epoch": 0.21145, + "grad_norm": 0.8317932635144787, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 21145 + }, + { + "epoch": 0.21146, + "grad_norm": 0.8967437236032432, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 21146 + }, + { + "epoch": 0.21147, + "grad_norm": 0.8759587248159304, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 21147 + }, + { + "epoch": 0.21148, + "grad_norm": 0.8234335022952023, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 21148 + }, + { + "epoch": 0.21149, + "grad_norm": 0.8347779727113863, + "learning_rate": 0.003, + "loss": 4.091, + "step": 21149 + }, + { + "epoch": 0.2115, + "grad_norm": 0.8771653489074704, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 21150 + }, + { + "epoch": 0.21151, + "grad_norm": 0.8868562373035072, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 21151 + }, + { + "epoch": 0.21152, + "grad_norm": 0.9260194636969789, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 21152 + }, + { + "epoch": 0.21153, + "grad_norm": 1.0278428326891138, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 21153 + }, + { + "epoch": 0.21154, + "grad_norm": 0.8406737049790429, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 21154 + }, + { + "epoch": 0.21155, + "grad_norm": 0.7115851023266901, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 21155 + }, + { + "epoch": 0.21156, + "grad_norm": 0.6527447656067576, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 21156 + }, + { + "epoch": 0.21157, + "grad_norm": 0.8198752729573484, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 21157 + }, + { + "epoch": 0.21158, + "grad_norm": 0.9566103901226111, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 21158 + }, + { + "epoch": 0.21159, + "grad_norm": 1.0908284159853958, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 21159 + }, + { + "epoch": 0.2116, + "grad_norm": 0.9359106029478939, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 21160 + }, + { + "epoch": 0.21161, + "grad_norm": 0.864010193572614, + "learning_rate": 0.003, + "loss": 4.047, + "step": 21161 + }, + { + "epoch": 0.21162, + "grad_norm": 0.8075852395167353, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 21162 + }, + { + "epoch": 0.21163, + "grad_norm": 0.9827122186692935, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 21163 + }, + { + "epoch": 0.21164, + "grad_norm": 1.2260172194200103, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 21164 + }, + { + "epoch": 0.21165, + "grad_norm": 0.8469270663970344, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 21165 + }, + { + "epoch": 0.21166, + "grad_norm": 0.7684862743742701, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 21166 + }, + { + "epoch": 0.21167, + "grad_norm": 0.7819136830951748, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 21167 + }, + { + "epoch": 0.21168, + "grad_norm": 0.8139741309161088, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 21168 + }, + { + "epoch": 0.21169, + "grad_norm": 0.9138785383857093, + "learning_rate": 0.003, + "loss": 4.059, + "step": 21169 + }, + { + "epoch": 0.2117, + "grad_norm": 0.9330372982311411, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 21170 + }, + { + "epoch": 0.21171, + "grad_norm": 0.8708103617659637, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 21171 + }, + { + "epoch": 0.21172, + "grad_norm": 0.9418160284270444, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 21172 + }, + { + "epoch": 0.21173, + "grad_norm": 1.0376885824291926, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 21173 + }, + { + "epoch": 0.21174, + "grad_norm": 0.9404880236615147, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 21174 + }, + { + "epoch": 0.21175, + "grad_norm": 0.9934125181461049, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 21175 + }, + { + "epoch": 0.21176, + "grad_norm": 1.1353149684021964, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 21176 + }, + { + "epoch": 0.21177, + "grad_norm": 0.8201070903363143, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 21177 + }, + { + "epoch": 0.21178, + "grad_norm": 0.8173249937295153, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 21178 + }, + { + "epoch": 0.21179, + "grad_norm": 0.8997426303253961, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 21179 + }, + { + "epoch": 0.2118, + "grad_norm": 0.9041938656269032, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 21180 + }, + { + "epoch": 0.21181, + "grad_norm": 0.8587673964737166, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 21181 + }, + { + "epoch": 0.21182, + "grad_norm": 1.0796240432190018, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 21182 + }, + { + "epoch": 0.21183, + "grad_norm": 1.0340901099432858, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 21183 + }, + { + "epoch": 0.21184, + "grad_norm": 1.0284079048161205, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 21184 + }, + { + "epoch": 0.21185, + "grad_norm": 0.9837865666573926, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 21185 + }, + { + "epoch": 0.21186, + "grad_norm": 0.999957853922835, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 21186 + }, + { + "epoch": 0.21187, + "grad_norm": 1.0867239755988904, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 21187 + }, + { + "epoch": 0.21188, + "grad_norm": 0.8751720863136792, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 21188 + }, + { + "epoch": 0.21189, + "grad_norm": 0.7883284206571608, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 21189 + }, + { + "epoch": 0.2119, + "grad_norm": 0.833843271179596, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 21190 + }, + { + "epoch": 0.21191, + "grad_norm": 0.7945471805342696, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 21191 + }, + { + "epoch": 0.21192, + "grad_norm": 0.859568000810372, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 21192 + }, + { + "epoch": 0.21193, + "grad_norm": 0.9499322342318987, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 21193 + }, + { + "epoch": 0.21194, + "grad_norm": 1.1699675423333233, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 21194 + }, + { + "epoch": 0.21195, + "grad_norm": 0.8798374398677336, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 21195 + }, + { + "epoch": 0.21196, + "grad_norm": 0.8420315326759298, + "learning_rate": 0.003, + "loss": 4.075, + "step": 21196 + }, + { + "epoch": 0.21197, + "grad_norm": 0.7435566165737909, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 21197 + }, + { + "epoch": 0.21198, + "grad_norm": 0.790755597947907, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 21198 + }, + { + "epoch": 0.21199, + "grad_norm": 0.7636089927077947, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 21199 + }, + { + "epoch": 0.212, + "grad_norm": 0.736372345884053, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 21200 + }, + { + "epoch": 0.21201, + "grad_norm": 0.7987946497201371, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 21201 + }, + { + "epoch": 0.21202, + "grad_norm": 1.0621123139084045, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 21202 + }, + { + "epoch": 0.21203, + "grad_norm": 1.0043085045508686, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 21203 + }, + { + "epoch": 0.21204, + "grad_norm": 0.9689772473591405, + "learning_rate": 0.003, + "loss": 4.064, + "step": 21204 + }, + { + "epoch": 0.21205, + "grad_norm": 0.8777533125102754, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 21205 + }, + { + "epoch": 0.21206, + "grad_norm": 0.9267300075194698, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 21206 + }, + { + "epoch": 0.21207, + "grad_norm": 0.9623997678273425, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 21207 + }, + { + "epoch": 0.21208, + "grad_norm": 1.0258782821472325, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 21208 + }, + { + "epoch": 0.21209, + "grad_norm": 0.9868977199698608, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 21209 + }, + { + "epoch": 0.2121, + "grad_norm": 0.8702205824102925, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 21210 + }, + { + "epoch": 0.21211, + "grad_norm": 0.9318447210972713, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 21211 + }, + { + "epoch": 0.21212, + "grad_norm": 1.080939522993914, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 21212 + }, + { + "epoch": 0.21213, + "grad_norm": 1.0355856763240312, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 21213 + }, + { + "epoch": 0.21214, + "grad_norm": 1.0599578015664297, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 21214 + }, + { + "epoch": 0.21215, + "grad_norm": 0.8878829110296694, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 21215 + }, + { + "epoch": 0.21216, + "grad_norm": 0.7177601093066023, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 21216 + }, + { + "epoch": 0.21217, + "grad_norm": 0.7111340447626696, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 21217 + }, + { + "epoch": 0.21218, + "grad_norm": 0.7273870077396913, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 21218 + }, + { + "epoch": 0.21219, + "grad_norm": 0.716870960657489, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 21219 + }, + { + "epoch": 0.2122, + "grad_norm": 0.7547271994958835, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 21220 + }, + { + "epoch": 0.21221, + "grad_norm": 0.9258241847127848, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 21221 + }, + { + "epoch": 0.21222, + "grad_norm": 1.084845220900056, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 21222 + }, + { + "epoch": 0.21223, + "grad_norm": 1.0505118625844345, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 21223 + }, + { + "epoch": 0.21224, + "grad_norm": 0.9754180762791759, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 21224 + }, + { + "epoch": 0.21225, + "grad_norm": 1.1567928169511146, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 21225 + }, + { + "epoch": 0.21226, + "grad_norm": 1.100104882338202, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 21226 + }, + { + "epoch": 0.21227, + "grad_norm": 0.7957564511547448, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 21227 + }, + { + "epoch": 0.21228, + "grad_norm": 0.7075555210106396, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 21228 + }, + { + "epoch": 0.21229, + "grad_norm": 0.7652379480633924, + "learning_rate": 0.003, + "loss": 4.06, + "step": 21229 + }, + { + "epoch": 0.2123, + "grad_norm": 0.7981211742797875, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 21230 + }, + { + "epoch": 0.21231, + "grad_norm": 0.8404954939470825, + "learning_rate": 0.003, + "loss": 4.072, + "step": 21231 + }, + { + "epoch": 0.21232, + "grad_norm": 0.8486995279467043, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 21232 + }, + { + "epoch": 0.21233, + "grad_norm": 1.0226108619112986, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 21233 + }, + { + "epoch": 0.21234, + "grad_norm": 1.0786944266582215, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 21234 + }, + { + "epoch": 0.21235, + "grad_norm": 0.8269548201711181, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 21235 + }, + { + "epoch": 0.21236, + "grad_norm": 0.7566574607208635, + "learning_rate": 0.003, + "loss": 4.034, + "step": 21236 + }, + { + "epoch": 0.21237, + "grad_norm": 0.8561778377388048, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 21237 + }, + { + "epoch": 0.21238, + "grad_norm": 0.9009112049680555, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 21238 + }, + { + "epoch": 0.21239, + "grad_norm": 0.7869095725484021, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 21239 + }, + { + "epoch": 0.2124, + "grad_norm": 0.8015076597896063, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 21240 + }, + { + "epoch": 0.21241, + "grad_norm": 0.8916400926794642, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 21241 + }, + { + "epoch": 0.21242, + "grad_norm": 0.8593105568320265, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 21242 + }, + { + "epoch": 0.21243, + "grad_norm": 0.8642499020925598, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 21243 + }, + { + "epoch": 0.21244, + "grad_norm": 0.9351452208459016, + "learning_rate": 0.003, + "loss": 4.05, + "step": 21244 + }, + { + "epoch": 0.21245, + "grad_norm": 1.051505354109061, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 21245 + }, + { + "epoch": 0.21246, + "grad_norm": 1.211082749679639, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 21246 + }, + { + "epoch": 0.21247, + "grad_norm": 1.0324620044571742, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 21247 + }, + { + "epoch": 0.21248, + "grad_norm": 1.0528376039558933, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 21248 + }, + { + "epoch": 0.21249, + "grad_norm": 1.0172951092601181, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 21249 + }, + { + "epoch": 0.2125, + "grad_norm": 1.0455916547903843, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 21250 + }, + { + "epoch": 0.21251, + "grad_norm": 0.9721666841995168, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 21251 + }, + { + "epoch": 0.21252, + "grad_norm": 0.9354980752446461, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 21252 + }, + { + "epoch": 0.21253, + "grad_norm": 0.8235369177814564, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 21253 + }, + { + "epoch": 0.21254, + "grad_norm": 0.708241936578851, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 21254 + }, + { + "epoch": 0.21255, + "grad_norm": 0.6590801615768045, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 21255 + }, + { + "epoch": 0.21256, + "grad_norm": 0.7687869672904036, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 21256 + }, + { + "epoch": 0.21257, + "grad_norm": 0.8110294457074578, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 21257 + }, + { + "epoch": 0.21258, + "grad_norm": 0.8835741457620595, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 21258 + }, + { + "epoch": 0.21259, + "grad_norm": 1.031106931791354, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 21259 + }, + { + "epoch": 0.2126, + "grad_norm": 1.0822589123455826, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 21260 + }, + { + "epoch": 0.21261, + "grad_norm": 0.8988172268989614, + "learning_rate": 0.003, + "loss": 4.062, + "step": 21261 + }, + { + "epoch": 0.21262, + "grad_norm": 0.7603041652393627, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 21262 + }, + { + "epoch": 0.21263, + "grad_norm": 0.6376354168758694, + "learning_rate": 0.003, + "loss": 4.0042, + "step": 21263 + }, + { + "epoch": 0.21264, + "grad_norm": 0.6667641683867043, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 21264 + }, + { + "epoch": 0.21265, + "grad_norm": 0.6712455135629539, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 21265 + }, + { + "epoch": 0.21266, + "grad_norm": 0.6568901608609755, + "learning_rate": 0.003, + "loss": 4.042, + "step": 21266 + }, + { + "epoch": 0.21267, + "grad_norm": 0.6299236679437372, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 21267 + }, + { + "epoch": 0.21268, + "grad_norm": 0.6619955714541537, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 21268 + }, + { + "epoch": 0.21269, + "grad_norm": 0.7720190038476618, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 21269 + }, + { + "epoch": 0.2127, + "grad_norm": 0.83960982461192, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 21270 + }, + { + "epoch": 0.21271, + "grad_norm": 0.816094079275652, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 21271 + }, + { + "epoch": 0.21272, + "grad_norm": 0.9558392509854234, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 21272 + }, + { + "epoch": 0.21273, + "grad_norm": 1.192041026982684, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 21273 + }, + { + "epoch": 0.21274, + "grad_norm": 0.8334068208891056, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 21274 + }, + { + "epoch": 0.21275, + "grad_norm": 0.8147451132296317, + "learning_rate": 0.003, + "loss": 4.008, + "step": 21275 + }, + { + "epoch": 0.21276, + "grad_norm": 0.8069330768413078, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 21276 + }, + { + "epoch": 0.21277, + "grad_norm": 0.9188295233850001, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 21277 + }, + { + "epoch": 0.21278, + "grad_norm": 1.23921945828254, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 21278 + }, + { + "epoch": 0.21279, + "grad_norm": 1.0243127214157668, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 21279 + }, + { + "epoch": 0.2128, + "grad_norm": 0.949600296006374, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 21280 + }, + { + "epoch": 0.21281, + "grad_norm": 0.8667360246932553, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 21281 + }, + { + "epoch": 0.21282, + "grad_norm": 0.8882339822228139, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 21282 + }, + { + "epoch": 0.21283, + "grad_norm": 0.8440891879162166, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 21283 + }, + { + "epoch": 0.21284, + "grad_norm": 0.8432786676373919, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 21284 + }, + { + "epoch": 0.21285, + "grad_norm": 0.9727308305242739, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 21285 + }, + { + "epoch": 0.21286, + "grad_norm": 1.1983776814240794, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 21286 + }, + { + "epoch": 0.21287, + "grad_norm": 0.8130550644223, + "learning_rate": 0.003, + "loss": 4.046, + "step": 21287 + }, + { + "epoch": 0.21288, + "grad_norm": 0.7421248628961178, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 21288 + }, + { + "epoch": 0.21289, + "grad_norm": 0.8242295095790686, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 21289 + }, + { + "epoch": 0.2129, + "grad_norm": 0.8853489494645336, + "learning_rate": 0.003, + "loss": 4.024, + "step": 21290 + }, + { + "epoch": 0.21291, + "grad_norm": 1.0966030917056349, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 21291 + }, + { + "epoch": 0.21292, + "grad_norm": 1.1923335315819226, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 21292 + }, + { + "epoch": 0.21293, + "grad_norm": 0.8737413966416677, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 21293 + }, + { + "epoch": 0.21294, + "grad_norm": 0.8072464878105456, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 21294 + }, + { + "epoch": 0.21295, + "grad_norm": 0.8197551822012741, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 21295 + }, + { + "epoch": 0.21296, + "grad_norm": 0.7666384684898897, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 21296 + }, + { + "epoch": 0.21297, + "grad_norm": 0.8659945444239481, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 21297 + }, + { + "epoch": 0.21298, + "grad_norm": 1.1660665394360092, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 21298 + }, + { + "epoch": 0.21299, + "grad_norm": 1.023780513710772, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 21299 + }, + { + "epoch": 0.213, + "grad_norm": 1.0678845657179334, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 21300 + }, + { + "epoch": 0.21301, + "grad_norm": 0.863857021155737, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 21301 + }, + { + "epoch": 0.21302, + "grad_norm": 0.775615516091285, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 21302 + }, + { + "epoch": 0.21303, + "grad_norm": 0.7506875613355735, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 21303 + }, + { + "epoch": 0.21304, + "grad_norm": 0.7135119295797271, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 21304 + }, + { + "epoch": 0.21305, + "grad_norm": 0.7035218754708156, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 21305 + }, + { + "epoch": 0.21306, + "grad_norm": 0.6711849400837023, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 21306 + }, + { + "epoch": 0.21307, + "grad_norm": 0.7781252451033537, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 21307 + }, + { + "epoch": 0.21308, + "grad_norm": 0.9939189260062887, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 21308 + }, + { + "epoch": 0.21309, + "grad_norm": 1.156881142245501, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 21309 + }, + { + "epoch": 0.2131, + "grad_norm": 0.7792490413634692, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 21310 + }, + { + "epoch": 0.21311, + "grad_norm": 0.7319875650903732, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 21311 + }, + { + "epoch": 0.21312, + "grad_norm": 0.795710184351996, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 21312 + }, + { + "epoch": 0.21313, + "grad_norm": 0.7345238860166494, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 21313 + }, + { + "epoch": 0.21314, + "grad_norm": 0.7004808014406678, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 21314 + }, + { + "epoch": 0.21315, + "grad_norm": 0.7031313239306645, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 21315 + }, + { + "epoch": 0.21316, + "grad_norm": 0.8075956495669555, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 21316 + }, + { + "epoch": 0.21317, + "grad_norm": 0.9973050178910247, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 21317 + }, + { + "epoch": 0.21318, + "grad_norm": 1.3681928645941812, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 21318 + }, + { + "epoch": 0.21319, + "grad_norm": 0.7330198948212318, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 21319 + }, + { + "epoch": 0.2132, + "grad_norm": 0.7739497247246485, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 21320 + }, + { + "epoch": 0.21321, + "grad_norm": 0.8460461446146745, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 21321 + }, + { + "epoch": 0.21322, + "grad_norm": 0.9394888310669375, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 21322 + }, + { + "epoch": 0.21323, + "grad_norm": 1.1570726057632688, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 21323 + }, + { + "epoch": 0.21324, + "grad_norm": 0.8497915856023048, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 21324 + }, + { + "epoch": 0.21325, + "grad_norm": 0.8038646541488577, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 21325 + }, + { + "epoch": 0.21326, + "grad_norm": 0.8961067657186734, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 21326 + }, + { + "epoch": 0.21327, + "grad_norm": 1.0219810982118778, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 21327 + }, + { + "epoch": 0.21328, + "grad_norm": 1.1051294523482973, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 21328 + }, + { + "epoch": 0.21329, + "grad_norm": 0.9889025080245338, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 21329 + }, + { + "epoch": 0.2133, + "grad_norm": 1.0100949131470505, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 21330 + }, + { + "epoch": 0.21331, + "grad_norm": 1.126585164504275, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 21331 + }, + { + "epoch": 0.21332, + "grad_norm": 0.7962512890266764, + "learning_rate": 0.003, + "loss": 4.068, + "step": 21332 + }, + { + "epoch": 0.21333, + "grad_norm": 0.8522845771627242, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 21333 + }, + { + "epoch": 0.21334, + "grad_norm": 0.9526029103214415, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 21334 + }, + { + "epoch": 0.21335, + "grad_norm": 1.1669842014286143, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 21335 + }, + { + "epoch": 0.21336, + "grad_norm": 0.8933792119256343, + "learning_rate": 0.003, + "loss": 4.056, + "step": 21336 + }, + { + "epoch": 0.21337, + "grad_norm": 0.877876816362361, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 21337 + }, + { + "epoch": 0.21338, + "grad_norm": 0.9273135760349166, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 21338 + }, + { + "epoch": 0.21339, + "grad_norm": 0.9318291343925488, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 21339 + }, + { + "epoch": 0.2134, + "grad_norm": 0.8403932154757081, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 21340 + }, + { + "epoch": 0.21341, + "grad_norm": 0.8695387354890809, + "learning_rate": 0.003, + "loss": 4.076, + "step": 21341 + }, + { + "epoch": 0.21342, + "grad_norm": 0.9400148271112372, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 21342 + }, + { + "epoch": 0.21343, + "grad_norm": 1.113601714658504, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 21343 + }, + { + "epoch": 0.21344, + "grad_norm": 1.0866489269900732, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 21344 + }, + { + "epoch": 0.21345, + "grad_norm": 1.1229349825443453, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 21345 + }, + { + "epoch": 0.21346, + "grad_norm": 0.9405376207001583, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 21346 + }, + { + "epoch": 0.21347, + "grad_norm": 0.9500346961923676, + "learning_rate": 0.003, + "loss": 4.078, + "step": 21347 + }, + { + "epoch": 0.21348, + "grad_norm": 0.8218563266258336, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 21348 + }, + { + "epoch": 0.21349, + "grad_norm": 0.834416220996816, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 21349 + }, + { + "epoch": 0.2135, + "grad_norm": 0.9078981301095694, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 21350 + }, + { + "epoch": 0.21351, + "grad_norm": 1.1390345127660626, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 21351 + }, + { + "epoch": 0.21352, + "grad_norm": 0.9086358048852432, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 21352 + }, + { + "epoch": 0.21353, + "grad_norm": 0.8204596465265185, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 21353 + }, + { + "epoch": 0.21354, + "grad_norm": 0.9019404314427433, + "learning_rate": 0.003, + "loss": 4.069, + "step": 21354 + }, + { + "epoch": 0.21355, + "grad_norm": 0.920367402924881, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 21355 + }, + { + "epoch": 0.21356, + "grad_norm": 0.7973367884096413, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 21356 + }, + { + "epoch": 0.21357, + "grad_norm": 0.6930357605376251, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 21357 + }, + { + "epoch": 0.21358, + "grad_norm": 0.7407767863293523, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 21358 + }, + { + "epoch": 0.21359, + "grad_norm": 0.7842652627237436, + "learning_rate": 0.003, + "loss": 4.044, + "step": 21359 + }, + { + "epoch": 0.2136, + "grad_norm": 0.8332080172008113, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 21360 + }, + { + "epoch": 0.21361, + "grad_norm": 0.885481222691971, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 21361 + }, + { + "epoch": 0.21362, + "grad_norm": 1.0102925759314678, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 21362 + }, + { + "epoch": 0.21363, + "grad_norm": 1.0928436872355154, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 21363 + }, + { + "epoch": 0.21364, + "grad_norm": 0.8767967390102701, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 21364 + }, + { + "epoch": 0.21365, + "grad_norm": 0.6746446662770155, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 21365 + }, + { + "epoch": 0.21366, + "grad_norm": 0.6755843158706554, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 21366 + }, + { + "epoch": 0.21367, + "grad_norm": 0.8169999232168187, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 21367 + }, + { + "epoch": 0.21368, + "grad_norm": 0.9405999873733871, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 21368 + }, + { + "epoch": 0.21369, + "grad_norm": 0.9740985947277164, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 21369 + }, + { + "epoch": 0.2137, + "grad_norm": 1.049346979694071, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 21370 + }, + { + "epoch": 0.21371, + "grad_norm": 1.0640850585130066, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 21371 + }, + { + "epoch": 0.21372, + "grad_norm": 0.957192180995113, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 21372 + }, + { + "epoch": 0.21373, + "grad_norm": 1.1119459871247441, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 21373 + }, + { + "epoch": 0.21374, + "grad_norm": 1.0553609199919132, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 21374 + }, + { + "epoch": 0.21375, + "grad_norm": 0.905381816938977, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 21375 + }, + { + "epoch": 0.21376, + "grad_norm": 0.9007522996477779, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 21376 + }, + { + "epoch": 0.21377, + "grad_norm": 0.9671815182500294, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 21377 + }, + { + "epoch": 0.21378, + "grad_norm": 0.9794147638607366, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 21378 + }, + { + "epoch": 0.21379, + "grad_norm": 1.0823450455423465, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 21379 + }, + { + "epoch": 0.2138, + "grad_norm": 1.0776837298983957, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 21380 + }, + { + "epoch": 0.21381, + "grad_norm": 1.0550059105757812, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 21381 + }, + { + "epoch": 0.21382, + "grad_norm": 0.9060408240886062, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 21382 + }, + { + "epoch": 0.21383, + "grad_norm": 0.9105263545879362, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 21383 + }, + { + "epoch": 0.21384, + "grad_norm": 0.9325095562625706, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 21384 + }, + { + "epoch": 0.21385, + "grad_norm": 0.9105796920224166, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 21385 + }, + { + "epoch": 0.21386, + "grad_norm": 1.0527195723375458, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 21386 + }, + { + "epoch": 0.21387, + "grad_norm": 0.8948238423275834, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 21387 + }, + { + "epoch": 0.21388, + "grad_norm": 0.800240153649954, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 21388 + }, + { + "epoch": 0.21389, + "grad_norm": 0.8274793267588422, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 21389 + }, + { + "epoch": 0.2139, + "grad_norm": 0.6110452753450674, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 21390 + }, + { + "epoch": 0.21391, + "grad_norm": 0.5946604858150054, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 21391 + }, + { + "epoch": 0.21392, + "grad_norm": 0.6960747948357103, + "learning_rate": 0.003, + "loss": 4.05, + "step": 21392 + }, + { + "epoch": 0.21393, + "grad_norm": 0.7462711901382988, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 21393 + }, + { + "epoch": 0.21394, + "grad_norm": 0.733471496702728, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 21394 + }, + { + "epoch": 0.21395, + "grad_norm": 0.7577502191514479, + "learning_rate": 0.003, + "loss": 4.049, + "step": 21395 + }, + { + "epoch": 0.21396, + "grad_norm": 0.840421108050357, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 21396 + }, + { + "epoch": 0.21397, + "grad_norm": 0.818519133768956, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 21397 + }, + { + "epoch": 0.21398, + "grad_norm": 0.8779364785315508, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 21398 + }, + { + "epoch": 0.21399, + "grad_norm": 1.059892231131895, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 21399 + }, + { + "epoch": 0.214, + "grad_norm": 1.1741248691709623, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 21400 + }, + { + "epoch": 0.21401, + "grad_norm": 0.8686644036094342, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 21401 + }, + { + "epoch": 0.21402, + "grad_norm": 0.8992972114790656, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 21402 + }, + { + "epoch": 0.21403, + "grad_norm": 0.8911434120168006, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 21403 + }, + { + "epoch": 0.21404, + "grad_norm": 0.7706015238868186, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 21404 + }, + { + "epoch": 0.21405, + "grad_norm": 0.7255141803490246, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 21405 + }, + { + "epoch": 0.21406, + "grad_norm": 0.7647138842330985, + "learning_rate": 0.003, + "loss": 4.032, + "step": 21406 + }, + { + "epoch": 0.21407, + "grad_norm": 0.7389923966534094, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 21407 + }, + { + "epoch": 0.21408, + "grad_norm": 0.7087570950921596, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 21408 + }, + { + "epoch": 0.21409, + "grad_norm": 0.6780210754694448, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 21409 + }, + { + "epoch": 0.2141, + "grad_norm": 0.876655044563196, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 21410 + }, + { + "epoch": 0.21411, + "grad_norm": 0.9919748322359662, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 21411 + }, + { + "epoch": 0.21412, + "grad_norm": 1.2636542450740191, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 21412 + }, + { + "epoch": 0.21413, + "grad_norm": 0.7131346267936488, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 21413 + }, + { + "epoch": 0.21414, + "grad_norm": 0.6322143293499282, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 21414 + }, + { + "epoch": 0.21415, + "grad_norm": 0.7819501573375415, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 21415 + }, + { + "epoch": 0.21416, + "grad_norm": 0.9014886808134205, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 21416 + }, + { + "epoch": 0.21417, + "grad_norm": 1.2190233411001883, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 21417 + }, + { + "epoch": 0.21418, + "grad_norm": 0.8659790766710892, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 21418 + }, + { + "epoch": 0.21419, + "grad_norm": 0.8135526653004657, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 21419 + }, + { + "epoch": 0.2142, + "grad_norm": 0.8568164621978299, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 21420 + }, + { + "epoch": 0.21421, + "grad_norm": 0.8490283973014067, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 21421 + }, + { + "epoch": 0.21422, + "grad_norm": 0.8670183618441724, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 21422 + }, + { + "epoch": 0.21423, + "grad_norm": 0.8008910809403382, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 21423 + }, + { + "epoch": 0.21424, + "grad_norm": 0.731955970182008, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 21424 + }, + { + "epoch": 0.21425, + "grad_norm": 0.6740513560649253, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 21425 + }, + { + "epoch": 0.21426, + "grad_norm": 0.7676704833964977, + "learning_rate": 0.003, + "loss": 4.072, + "step": 21426 + }, + { + "epoch": 0.21427, + "grad_norm": 0.97995262106898, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 21427 + }, + { + "epoch": 0.21428, + "grad_norm": 1.3215832951005382, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 21428 + }, + { + "epoch": 0.21429, + "grad_norm": 0.9560479202046296, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 21429 + }, + { + "epoch": 0.2143, + "grad_norm": 1.0841098992244047, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 21430 + }, + { + "epoch": 0.21431, + "grad_norm": 1.055062025591932, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 21431 + }, + { + "epoch": 0.21432, + "grad_norm": 1.024259057986223, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 21432 + }, + { + "epoch": 0.21433, + "grad_norm": 0.8386995142459611, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 21433 + }, + { + "epoch": 0.21434, + "grad_norm": 0.8696884076101957, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 21434 + }, + { + "epoch": 0.21435, + "grad_norm": 0.8539816541452752, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 21435 + }, + { + "epoch": 0.21436, + "grad_norm": 0.8098803658264432, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 21436 + }, + { + "epoch": 0.21437, + "grad_norm": 0.9616615687849573, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 21437 + }, + { + "epoch": 0.21438, + "grad_norm": 1.1090659610752636, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 21438 + }, + { + "epoch": 0.21439, + "grad_norm": 0.8833350970630555, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 21439 + }, + { + "epoch": 0.2144, + "grad_norm": 0.9382105328542503, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 21440 + }, + { + "epoch": 0.21441, + "grad_norm": 1.0334773875653411, + "learning_rate": 0.003, + "loss": 4.043, + "step": 21441 + }, + { + "epoch": 0.21442, + "grad_norm": 1.0881440523884154, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 21442 + }, + { + "epoch": 0.21443, + "grad_norm": 1.0229790824041554, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 21443 + }, + { + "epoch": 0.21444, + "grad_norm": 0.8141416583645512, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 21444 + }, + { + "epoch": 0.21445, + "grad_norm": 0.7861323514029146, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 21445 + }, + { + "epoch": 0.21446, + "grad_norm": 0.810458938640283, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 21446 + }, + { + "epoch": 0.21447, + "grad_norm": 1.0473292887194876, + "learning_rate": 0.003, + "loss": 4.035, + "step": 21447 + }, + { + "epoch": 0.21448, + "grad_norm": 1.2288446034500558, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 21448 + }, + { + "epoch": 0.21449, + "grad_norm": 0.8277477834621442, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 21449 + }, + { + "epoch": 0.2145, + "grad_norm": 0.7543736141945366, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 21450 + }, + { + "epoch": 0.21451, + "grad_norm": 0.7154744939626335, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 21451 + }, + { + "epoch": 0.21452, + "grad_norm": 0.7533594485150341, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 21452 + }, + { + "epoch": 0.21453, + "grad_norm": 0.8395348351440081, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 21453 + }, + { + "epoch": 0.21454, + "grad_norm": 0.8361094780878154, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 21454 + }, + { + "epoch": 0.21455, + "grad_norm": 0.8511347730897709, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 21455 + }, + { + "epoch": 0.21456, + "grad_norm": 0.8775439946918151, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 21456 + }, + { + "epoch": 0.21457, + "grad_norm": 0.827383981504461, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 21457 + }, + { + "epoch": 0.21458, + "grad_norm": 1.112083865598023, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 21458 + }, + { + "epoch": 0.21459, + "grad_norm": 1.1616812671504075, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 21459 + }, + { + "epoch": 0.2146, + "grad_norm": 0.8078230457548922, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 21460 + }, + { + "epoch": 0.21461, + "grad_norm": 0.7022166266473627, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 21461 + }, + { + "epoch": 0.21462, + "grad_norm": 0.7230187039787708, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 21462 + }, + { + "epoch": 0.21463, + "grad_norm": 0.8674659440770295, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 21463 + }, + { + "epoch": 0.21464, + "grad_norm": 1.0916279614881734, + "learning_rate": 0.003, + "loss": 4.076, + "step": 21464 + }, + { + "epoch": 0.21465, + "grad_norm": 0.9465329070055324, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 21465 + }, + { + "epoch": 0.21466, + "grad_norm": 0.9964260038663965, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 21466 + }, + { + "epoch": 0.21467, + "grad_norm": 1.1221401265737272, + "learning_rate": 0.003, + "loss": 4.083, + "step": 21467 + }, + { + "epoch": 0.21468, + "grad_norm": 0.8291025254374909, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 21468 + }, + { + "epoch": 0.21469, + "grad_norm": 0.8477495422477623, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 21469 + }, + { + "epoch": 0.2147, + "grad_norm": 0.8807154171946066, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 21470 + }, + { + "epoch": 0.21471, + "grad_norm": 0.9242952518551273, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 21471 + }, + { + "epoch": 0.21472, + "grad_norm": 0.8877809205405255, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 21472 + }, + { + "epoch": 0.21473, + "grad_norm": 0.8806846354548536, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 21473 + }, + { + "epoch": 0.21474, + "grad_norm": 0.8608948945127014, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 21474 + }, + { + "epoch": 0.21475, + "grad_norm": 0.7386160940354448, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 21475 + }, + { + "epoch": 0.21476, + "grad_norm": 0.8520807385198372, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 21476 + }, + { + "epoch": 0.21477, + "grad_norm": 0.9312350048914875, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 21477 + }, + { + "epoch": 0.21478, + "grad_norm": 1.0285513080363073, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 21478 + }, + { + "epoch": 0.21479, + "grad_norm": 1.1856745324025493, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 21479 + }, + { + "epoch": 0.2148, + "grad_norm": 1.0755530879024613, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 21480 + }, + { + "epoch": 0.21481, + "grad_norm": 0.9067248080300517, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 21481 + }, + { + "epoch": 0.21482, + "grad_norm": 0.9503152864041258, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 21482 + }, + { + "epoch": 0.21483, + "grad_norm": 1.0586304737700027, + "learning_rate": 0.003, + "loss": 4.065, + "step": 21483 + }, + { + "epoch": 0.21484, + "grad_norm": 0.9660453492246902, + "learning_rate": 0.003, + "loss": 4.072, + "step": 21484 + }, + { + "epoch": 0.21485, + "grad_norm": 0.8018934200619793, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 21485 + }, + { + "epoch": 0.21486, + "grad_norm": 0.6957883514573063, + "learning_rate": 0.003, + "loss": 4.081, + "step": 21486 + }, + { + "epoch": 0.21487, + "grad_norm": 0.6737104591167143, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 21487 + }, + { + "epoch": 0.21488, + "grad_norm": 0.6925581284879242, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 21488 + }, + { + "epoch": 0.21489, + "grad_norm": 0.7557232532101538, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 21489 + }, + { + "epoch": 0.2149, + "grad_norm": 0.8963622020095259, + "learning_rate": 0.003, + "loss": 4.059, + "step": 21490 + }, + { + "epoch": 0.21491, + "grad_norm": 1.102121592063994, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 21491 + }, + { + "epoch": 0.21492, + "grad_norm": 0.7762725138771334, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 21492 + }, + { + "epoch": 0.21493, + "grad_norm": 0.7680256194872983, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 21493 + }, + { + "epoch": 0.21494, + "grad_norm": 0.8702451922180889, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 21494 + }, + { + "epoch": 0.21495, + "grad_norm": 0.848564649327761, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 21495 + }, + { + "epoch": 0.21496, + "grad_norm": 0.9102315469982839, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 21496 + }, + { + "epoch": 0.21497, + "grad_norm": 1.0822172076151446, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 21497 + }, + { + "epoch": 0.21498, + "grad_norm": 1.0227927189533406, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 21498 + }, + { + "epoch": 0.21499, + "grad_norm": 0.9203883246793417, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 21499 + }, + { + "epoch": 0.215, + "grad_norm": 0.9763742479663114, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 21500 + }, + { + "epoch": 0.21501, + "grad_norm": 0.9834431876388631, + "learning_rate": 0.003, + "loss": 4.085, + "step": 21501 + }, + { + "epoch": 0.21502, + "grad_norm": 1.1189385906350726, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 21502 + }, + { + "epoch": 0.21503, + "grad_norm": 0.9885298973184874, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 21503 + }, + { + "epoch": 0.21504, + "grad_norm": 1.128632283203252, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 21504 + }, + { + "epoch": 0.21505, + "grad_norm": 0.9284090737407498, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 21505 + }, + { + "epoch": 0.21506, + "grad_norm": 0.8118828230908535, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 21506 + }, + { + "epoch": 0.21507, + "grad_norm": 0.6797074208566537, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 21507 + }, + { + "epoch": 0.21508, + "grad_norm": 0.7031775790983548, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 21508 + }, + { + "epoch": 0.21509, + "grad_norm": 0.7191577393702786, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 21509 + }, + { + "epoch": 0.2151, + "grad_norm": 0.700204199089381, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 21510 + }, + { + "epoch": 0.21511, + "grad_norm": 0.6202389829836995, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 21511 + }, + { + "epoch": 0.21512, + "grad_norm": 0.5687590389877023, + "learning_rate": 0.003, + "loss": 4.0026, + "step": 21512 + }, + { + "epoch": 0.21513, + "grad_norm": 0.5963252648663018, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 21513 + }, + { + "epoch": 0.21514, + "grad_norm": 0.586654520117425, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 21514 + }, + { + "epoch": 0.21515, + "grad_norm": 0.637903565746543, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 21515 + }, + { + "epoch": 0.21516, + "grad_norm": 0.701677471648405, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 21516 + }, + { + "epoch": 0.21517, + "grad_norm": 0.822810744954545, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 21517 + }, + { + "epoch": 0.21518, + "grad_norm": 1.1199938157365958, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 21518 + }, + { + "epoch": 0.21519, + "grad_norm": 1.3110957899877198, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 21519 + }, + { + "epoch": 0.2152, + "grad_norm": 0.6399605418274914, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 21520 + }, + { + "epoch": 0.21521, + "grad_norm": 0.701215838253175, + "learning_rate": 0.003, + "loss": 4.046, + "step": 21521 + }, + { + "epoch": 0.21522, + "grad_norm": 0.785436350636527, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 21522 + }, + { + "epoch": 0.21523, + "grad_norm": 0.8871016659784559, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 21523 + }, + { + "epoch": 0.21524, + "grad_norm": 0.8941368839002816, + "learning_rate": 0.003, + "loss": 4.064, + "step": 21524 + }, + { + "epoch": 0.21525, + "grad_norm": 0.9401662392758682, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 21525 + }, + { + "epoch": 0.21526, + "grad_norm": 0.9725199864135584, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 21526 + }, + { + "epoch": 0.21527, + "grad_norm": 1.0258967049026004, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 21527 + }, + { + "epoch": 0.21528, + "grad_norm": 0.9167330372871186, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 21528 + }, + { + "epoch": 0.21529, + "grad_norm": 0.9036623471506209, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 21529 + }, + { + "epoch": 0.2153, + "grad_norm": 0.9834314313741844, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 21530 + }, + { + "epoch": 0.21531, + "grad_norm": 0.9625753642740854, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 21531 + }, + { + "epoch": 0.21532, + "grad_norm": 0.9023678816797029, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 21532 + }, + { + "epoch": 0.21533, + "grad_norm": 0.8407633518286873, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 21533 + }, + { + "epoch": 0.21534, + "grad_norm": 0.9129817559963305, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 21534 + }, + { + "epoch": 0.21535, + "grad_norm": 1.256952888320241, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 21535 + }, + { + "epoch": 0.21536, + "grad_norm": 0.9992239122319129, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 21536 + }, + { + "epoch": 0.21537, + "grad_norm": 0.8984541090414525, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 21537 + }, + { + "epoch": 0.21538, + "grad_norm": 0.7979672264163751, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 21538 + }, + { + "epoch": 0.21539, + "grad_norm": 0.7996716650769474, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 21539 + }, + { + "epoch": 0.2154, + "grad_norm": 0.9718535858845164, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 21540 + }, + { + "epoch": 0.21541, + "grad_norm": 1.1305992578210367, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 21541 + }, + { + "epoch": 0.21542, + "grad_norm": 1.2332343700140111, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 21542 + }, + { + "epoch": 0.21543, + "grad_norm": 0.7642761200892136, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 21543 + }, + { + "epoch": 0.21544, + "grad_norm": 0.6267605744638748, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 21544 + }, + { + "epoch": 0.21545, + "grad_norm": 0.6889176470588887, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 21545 + }, + { + "epoch": 0.21546, + "grad_norm": 0.7668689788665622, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 21546 + }, + { + "epoch": 0.21547, + "grad_norm": 0.7800768554991903, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 21547 + }, + { + "epoch": 0.21548, + "grad_norm": 0.8047459837679152, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 21548 + }, + { + "epoch": 0.21549, + "grad_norm": 0.8408625066215695, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 21549 + }, + { + "epoch": 0.2155, + "grad_norm": 0.8056977882994107, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 21550 + }, + { + "epoch": 0.21551, + "grad_norm": 0.7898326719010477, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 21551 + }, + { + "epoch": 0.21552, + "grad_norm": 0.7888800636508843, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 21552 + }, + { + "epoch": 0.21553, + "grad_norm": 0.863964246220263, + "learning_rate": 0.003, + "loss": 4.04, + "step": 21553 + }, + { + "epoch": 0.21554, + "grad_norm": 0.9936442170022771, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 21554 + }, + { + "epoch": 0.21555, + "grad_norm": 1.0209067339858764, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 21555 + }, + { + "epoch": 0.21556, + "grad_norm": 0.918394813071859, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 21556 + }, + { + "epoch": 0.21557, + "grad_norm": 0.841641707812552, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 21557 + }, + { + "epoch": 0.21558, + "grad_norm": 0.8963472011396417, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 21558 + }, + { + "epoch": 0.21559, + "grad_norm": 0.8973791510759853, + "learning_rate": 0.003, + "loss": 4.087, + "step": 21559 + }, + { + "epoch": 0.2156, + "grad_norm": 1.073379763863376, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 21560 + }, + { + "epoch": 0.21561, + "grad_norm": 1.1069851196346872, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 21561 + }, + { + "epoch": 0.21562, + "grad_norm": 1.1147197803258264, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 21562 + }, + { + "epoch": 0.21563, + "grad_norm": 0.8682319839410553, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 21563 + }, + { + "epoch": 0.21564, + "grad_norm": 0.7881534960003074, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 21564 + }, + { + "epoch": 0.21565, + "grad_norm": 0.7243852866001149, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 21565 + }, + { + "epoch": 0.21566, + "grad_norm": 0.8044460801035058, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 21566 + }, + { + "epoch": 0.21567, + "grad_norm": 0.9610971237661414, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 21567 + }, + { + "epoch": 0.21568, + "grad_norm": 1.0299650809931202, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 21568 + }, + { + "epoch": 0.21569, + "grad_norm": 1.1203253930132897, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 21569 + }, + { + "epoch": 0.2157, + "grad_norm": 0.8847565450160156, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 21570 + }, + { + "epoch": 0.21571, + "grad_norm": 0.7899478323008584, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 21571 + }, + { + "epoch": 0.21572, + "grad_norm": 0.7697418053044778, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 21572 + }, + { + "epoch": 0.21573, + "grad_norm": 0.6961944058455329, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 21573 + }, + { + "epoch": 0.21574, + "grad_norm": 0.7714756346724448, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 21574 + }, + { + "epoch": 0.21575, + "grad_norm": 0.808352357277223, + "learning_rate": 0.003, + "loss": 4.063, + "step": 21575 + }, + { + "epoch": 0.21576, + "grad_norm": 0.8327116352852698, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 21576 + }, + { + "epoch": 0.21577, + "grad_norm": 0.9962676451452657, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 21577 + }, + { + "epoch": 0.21578, + "grad_norm": 1.2953923746605633, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 21578 + }, + { + "epoch": 0.21579, + "grad_norm": 0.69211197211348, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 21579 + }, + { + "epoch": 0.2158, + "grad_norm": 0.7359911437887672, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 21580 + }, + { + "epoch": 0.21581, + "grad_norm": 0.9178731069503201, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 21581 + }, + { + "epoch": 0.21582, + "grad_norm": 1.0667355843827313, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 21582 + }, + { + "epoch": 0.21583, + "grad_norm": 1.1037629429987468, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 21583 + }, + { + "epoch": 0.21584, + "grad_norm": 1.0563403491958434, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 21584 + }, + { + "epoch": 0.21585, + "grad_norm": 1.066679010727024, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 21585 + }, + { + "epoch": 0.21586, + "grad_norm": 1.04442020629008, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 21586 + }, + { + "epoch": 0.21587, + "grad_norm": 0.8937336884621242, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 21587 + }, + { + "epoch": 0.21588, + "grad_norm": 0.881480218029145, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 21588 + }, + { + "epoch": 0.21589, + "grad_norm": 0.747716770035972, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 21589 + }, + { + "epoch": 0.2159, + "grad_norm": 0.7978472509505494, + "learning_rate": 0.003, + "loss": 4.043, + "step": 21590 + }, + { + "epoch": 0.21591, + "grad_norm": 0.7609638617708171, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 21591 + }, + { + "epoch": 0.21592, + "grad_norm": 0.8658713023705344, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 21592 + }, + { + "epoch": 0.21593, + "grad_norm": 0.9922656287287943, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 21593 + }, + { + "epoch": 0.21594, + "grad_norm": 1.0059345342528614, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 21594 + }, + { + "epoch": 0.21595, + "grad_norm": 1.1250739057614159, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 21595 + }, + { + "epoch": 0.21596, + "grad_norm": 0.8976772724944533, + "learning_rate": 0.003, + "loss": 4.042, + "step": 21596 + }, + { + "epoch": 0.21597, + "grad_norm": 0.9098047129674696, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 21597 + }, + { + "epoch": 0.21598, + "grad_norm": 1.0585225519338008, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 21598 + }, + { + "epoch": 0.21599, + "grad_norm": 1.1396511428165137, + "learning_rate": 0.003, + "loss": 4.057, + "step": 21599 + }, + { + "epoch": 0.216, + "grad_norm": 0.8774084028092378, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 21600 + }, + { + "epoch": 0.21601, + "grad_norm": 0.8939184951426141, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 21601 + }, + { + "epoch": 0.21602, + "grad_norm": 0.811550140556642, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 21602 + }, + { + "epoch": 0.21603, + "grad_norm": 0.6851775951282392, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 21603 + }, + { + "epoch": 0.21604, + "grad_norm": 0.7080152332646765, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 21604 + }, + { + "epoch": 0.21605, + "grad_norm": 0.7462907969939713, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 21605 + }, + { + "epoch": 0.21606, + "grad_norm": 0.7348892141682789, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 21606 + }, + { + "epoch": 0.21607, + "grad_norm": 0.9285085750475869, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 21607 + }, + { + "epoch": 0.21608, + "grad_norm": 1.2740961615973008, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 21608 + }, + { + "epoch": 0.21609, + "grad_norm": 0.8588938489802174, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 21609 + }, + { + "epoch": 0.2161, + "grad_norm": 0.8117988589784464, + "learning_rate": 0.003, + "loss": 4.037, + "step": 21610 + }, + { + "epoch": 0.21611, + "grad_norm": 0.8068452726889168, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 21611 + }, + { + "epoch": 0.21612, + "grad_norm": 0.7470045278346455, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 21612 + }, + { + "epoch": 0.21613, + "grad_norm": 0.7622859900911136, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 21613 + }, + { + "epoch": 0.21614, + "grad_norm": 0.887045992715797, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 21614 + }, + { + "epoch": 0.21615, + "grad_norm": 0.9104412327953356, + "learning_rate": 0.003, + "loss": 4.061, + "step": 21615 + }, + { + "epoch": 0.21616, + "grad_norm": 0.8559730237146324, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 21616 + }, + { + "epoch": 0.21617, + "grad_norm": 1.0582391420907316, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 21617 + }, + { + "epoch": 0.21618, + "grad_norm": 1.0484971344165914, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 21618 + }, + { + "epoch": 0.21619, + "grad_norm": 1.1274502375560465, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 21619 + }, + { + "epoch": 0.2162, + "grad_norm": 1.1224225085747928, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 21620 + }, + { + "epoch": 0.21621, + "grad_norm": 0.9141013533787247, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 21621 + }, + { + "epoch": 0.21622, + "grad_norm": 0.8814419925223598, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 21622 + }, + { + "epoch": 0.21623, + "grad_norm": 0.8747538403440693, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 21623 + }, + { + "epoch": 0.21624, + "grad_norm": 0.9642456331289045, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 21624 + }, + { + "epoch": 0.21625, + "grad_norm": 1.0237967212268972, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 21625 + }, + { + "epoch": 0.21626, + "grad_norm": 0.8914926550164441, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 21626 + }, + { + "epoch": 0.21627, + "grad_norm": 0.8911546461704158, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 21627 + }, + { + "epoch": 0.21628, + "grad_norm": 0.8773831700484963, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 21628 + }, + { + "epoch": 0.21629, + "grad_norm": 0.7283344669390761, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 21629 + }, + { + "epoch": 0.2163, + "grad_norm": 0.6710599648361194, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 21630 + }, + { + "epoch": 0.21631, + "grad_norm": 0.6924432102416787, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 21631 + }, + { + "epoch": 0.21632, + "grad_norm": 0.7931314253636245, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 21632 + }, + { + "epoch": 0.21633, + "grad_norm": 1.0012851697138478, + "learning_rate": 0.003, + "loss": 4.076, + "step": 21633 + }, + { + "epoch": 0.21634, + "grad_norm": 1.1844095005818749, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 21634 + }, + { + "epoch": 0.21635, + "grad_norm": 0.8939170965391628, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 21635 + }, + { + "epoch": 0.21636, + "grad_norm": 0.8386339167110952, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 21636 + }, + { + "epoch": 0.21637, + "grad_norm": 0.9406366770347894, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 21637 + }, + { + "epoch": 0.21638, + "grad_norm": 1.0056696953763926, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 21638 + }, + { + "epoch": 0.21639, + "grad_norm": 1.1459665315558933, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 21639 + }, + { + "epoch": 0.2164, + "grad_norm": 1.2879038591592724, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 21640 + }, + { + "epoch": 0.21641, + "grad_norm": 0.7754210785300859, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 21641 + }, + { + "epoch": 0.21642, + "grad_norm": 0.7576668442682193, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 21642 + }, + { + "epoch": 0.21643, + "grad_norm": 0.8291307354555252, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 21643 + }, + { + "epoch": 0.21644, + "grad_norm": 0.9440496609742046, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 21644 + }, + { + "epoch": 0.21645, + "grad_norm": 1.057683259501635, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 21645 + }, + { + "epoch": 0.21646, + "grad_norm": 0.965074045398963, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 21646 + }, + { + "epoch": 0.21647, + "grad_norm": 1.0388419001739948, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 21647 + }, + { + "epoch": 0.21648, + "grad_norm": 0.827878802746287, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 21648 + }, + { + "epoch": 0.21649, + "grad_norm": 0.7772531968996987, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 21649 + }, + { + "epoch": 0.2165, + "grad_norm": 0.9006617238650141, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 21650 + }, + { + "epoch": 0.21651, + "grad_norm": 1.1326992391189854, + "learning_rate": 0.003, + "loss": 4.068, + "step": 21651 + }, + { + "epoch": 0.21652, + "grad_norm": 0.927373925624323, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 21652 + }, + { + "epoch": 0.21653, + "grad_norm": 0.8504422081880315, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 21653 + }, + { + "epoch": 0.21654, + "grad_norm": 0.9677334091580314, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 21654 + }, + { + "epoch": 0.21655, + "grad_norm": 1.0793538904197635, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 21655 + }, + { + "epoch": 0.21656, + "grad_norm": 0.9933384589835113, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 21656 + }, + { + "epoch": 0.21657, + "grad_norm": 1.0372905967029282, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 21657 + }, + { + "epoch": 0.21658, + "grad_norm": 0.9473393589102788, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 21658 + }, + { + "epoch": 0.21659, + "grad_norm": 0.890156533040317, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 21659 + }, + { + "epoch": 0.2166, + "grad_norm": 0.8387443019839843, + "learning_rate": 0.003, + "loss": 4.052, + "step": 21660 + }, + { + "epoch": 0.21661, + "grad_norm": 0.9150766634331974, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 21661 + }, + { + "epoch": 0.21662, + "grad_norm": 1.0447368883019383, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 21662 + }, + { + "epoch": 0.21663, + "grad_norm": 0.8275767379179872, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 21663 + }, + { + "epoch": 0.21664, + "grad_norm": 0.9026190988602113, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 21664 + }, + { + "epoch": 0.21665, + "grad_norm": 0.9404827236433632, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 21665 + }, + { + "epoch": 0.21666, + "grad_norm": 1.0903495019012468, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 21666 + }, + { + "epoch": 0.21667, + "grad_norm": 1.0306961696004193, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 21667 + }, + { + "epoch": 0.21668, + "grad_norm": 0.9359899726193733, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 21668 + }, + { + "epoch": 0.21669, + "grad_norm": 0.9787413435144297, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 21669 + }, + { + "epoch": 0.2167, + "grad_norm": 0.9075609465237993, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 21670 + }, + { + "epoch": 0.21671, + "grad_norm": 0.910603955161684, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 21671 + }, + { + "epoch": 0.21672, + "grad_norm": 0.9974863409441619, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 21672 + }, + { + "epoch": 0.21673, + "grad_norm": 1.02372915997606, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 21673 + }, + { + "epoch": 0.21674, + "grad_norm": 0.9040700085483786, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 21674 + }, + { + "epoch": 0.21675, + "grad_norm": 0.839284094566396, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 21675 + }, + { + "epoch": 0.21676, + "grad_norm": 0.9456537268387836, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 21676 + }, + { + "epoch": 0.21677, + "grad_norm": 1.081206362503851, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 21677 + }, + { + "epoch": 0.21678, + "grad_norm": 1.120256209501071, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 21678 + }, + { + "epoch": 0.21679, + "grad_norm": 0.8484967980627793, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 21679 + }, + { + "epoch": 0.2168, + "grad_norm": 0.6274580468827639, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 21680 + }, + { + "epoch": 0.21681, + "grad_norm": 0.7160161315521397, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 21681 + }, + { + "epoch": 0.21682, + "grad_norm": 0.796191190119524, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 21682 + }, + { + "epoch": 0.21683, + "grad_norm": 0.906860065669769, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 21683 + }, + { + "epoch": 0.21684, + "grad_norm": 1.2389275850507053, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 21684 + }, + { + "epoch": 0.21685, + "grad_norm": 0.8667301294930585, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 21685 + }, + { + "epoch": 0.21686, + "grad_norm": 0.6864862165747931, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 21686 + }, + { + "epoch": 0.21687, + "grad_norm": 0.6752751222202794, + "learning_rate": 0.003, + "loss": 4.026, + "step": 21687 + }, + { + "epoch": 0.21688, + "grad_norm": 0.6393432157461432, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 21688 + }, + { + "epoch": 0.21689, + "grad_norm": 0.7213172548442143, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 21689 + }, + { + "epoch": 0.2169, + "grad_norm": 0.8068614310843161, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 21690 + }, + { + "epoch": 0.21691, + "grad_norm": 0.9735663056605472, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 21691 + }, + { + "epoch": 0.21692, + "grad_norm": 1.1510242492447276, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 21692 + }, + { + "epoch": 0.21693, + "grad_norm": 0.9088102727713627, + "learning_rate": 0.003, + "loss": 4.045, + "step": 21693 + }, + { + "epoch": 0.21694, + "grad_norm": 0.8790352573993063, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 21694 + }, + { + "epoch": 0.21695, + "grad_norm": 0.7258250167528519, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 21695 + }, + { + "epoch": 0.21696, + "grad_norm": 0.7566657780636684, + "learning_rate": 0.003, + "loss": 4.054, + "step": 21696 + }, + { + "epoch": 0.21697, + "grad_norm": 0.7481652075474815, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 21697 + }, + { + "epoch": 0.21698, + "grad_norm": 0.7995079503595919, + "learning_rate": 0.003, + "loss": 4.0001, + "step": 21698 + }, + { + "epoch": 0.21699, + "grad_norm": 0.9221970398932061, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 21699 + }, + { + "epoch": 0.217, + "grad_norm": 1.089530519639171, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 21700 + }, + { + "epoch": 0.21701, + "grad_norm": 0.9849748865019732, + "learning_rate": 0.003, + "loss": 4.07, + "step": 21701 + }, + { + "epoch": 0.21702, + "grad_norm": 1.0916368914461505, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 21702 + }, + { + "epoch": 0.21703, + "grad_norm": 0.9190329057858709, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 21703 + }, + { + "epoch": 0.21704, + "grad_norm": 0.9359031951464947, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 21704 + }, + { + "epoch": 0.21705, + "grad_norm": 0.8552540974942002, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 21705 + }, + { + "epoch": 0.21706, + "grad_norm": 0.8625463549116862, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 21706 + }, + { + "epoch": 0.21707, + "grad_norm": 0.7850073924414639, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 21707 + }, + { + "epoch": 0.21708, + "grad_norm": 0.7053690556535402, + "learning_rate": 0.003, + "loss": 4.042, + "step": 21708 + }, + { + "epoch": 0.21709, + "grad_norm": 0.679455169939329, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 21709 + }, + { + "epoch": 0.2171, + "grad_norm": 0.6063083427327239, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 21710 + }, + { + "epoch": 0.21711, + "grad_norm": 0.6042870558918229, + "learning_rate": 0.003, + "loss": 4.043, + "step": 21711 + }, + { + "epoch": 0.21712, + "grad_norm": 0.5876597030766894, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 21712 + }, + { + "epoch": 0.21713, + "grad_norm": 0.7139788375727066, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 21713 + }, + { + "epoch": 0.21714, + "grad_norm": 0.8774489981343453, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 21714 + }, + { + "epoch": 0.21715, + "grad_norm": 1.0956251394978795, + "learning_rate": 0.003, + "loss": 4.051, + "step": 21715 + }, + { + "epoch": 0.21716, + "grad_norm": 1.1323142548228589, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 21716 + }, + { + "epoch": 0.21717, + "grad_norm": 0.855765529664438, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 21717 + }, + { + "epoch": 0.21718, + "grad_norm": 0.7862372621629845, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 21718 + }, + { + "epoch": 0.21719, + "grad_norm": 0.8765073174328232, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 21719 + }, + { + "epoch": 0.2172, + "grad_norm": 1.0701874298564344, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 21720 + }, + { + "epoch": 0.21721, + "grad_norm": 1.003207493277841, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 21721 + }, + { + "epoch": 0.21722, + "grad_norm": 1.0098106141529626, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 21722 + }, + { + "epoch": 0.21723, + "grad_norm": 1.2630803083497524, + "learning_rate": 0.003, + "loss": 4.062, + "step": 21723 + }, + { + "epoch": 0.21724, + "grad_norm": 0.8200706429950618, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 21724 + }, + { + "epoch": 0.21725, + "grad_norm": 0.7808013269229824, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 21725 + }, + { + "epoch": 0.21726, + "grad_norm": 0.8076121987238968, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 21726 + }, + { + "epoch": 0.21727, + "grad_norm": 0.9340666616098632, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 21727 + }, + { + "epoch": 0.21728, + "grad_norm": 0.9423194360069064, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 21728 + }, + { + "epoch": 0.21729, + "grad_norm": 1.0503006535625348, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 21729 + }, + { + "epoch": 0.2173, + "grad_norm": 0.8724642195578092, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 21730 + }, + { + "epoch": 0.21731, + "grad_norm": 0.968236584958818, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 21731 + }, + { + "epoch": 0.21732, + "grad_norm": 0.8207492422821328, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 21732 + }, + { + "epoch": 0.21733, + "grad_norm": 0.8437198040993463, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 21733 + }, + { + "epoch": 0.21734, + "grad_norm": 0.9601208917283135, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 21734 + }, + { + "epoch": 0.21735, + "grad_norm": 1.1337011536669632, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 21735 + }, + { + "epoch": 0.21736, + "grad_norm": 1.124063963359658, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 21736 + }, + { + "epoch": 0.21737, + "grad_norm": 1.0449466232202593, + "learning_rate": 0.003, + "loss": 4.039, + "step": 21737 + }, + { + "epoch": 0.21738, + "grad_norm": 1.0890919556090963, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 21738 + }, + { + "epoch": 0.21739, + "grad_norm": 0.9816031771809035, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 21739 + }, + { + "epoch": 0.2174, + "grad_norm": 0.9426841605446407, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 21740 + }, + { + "epoch": 0.21741, + "grad_norm": 0.9543811669691511, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 21741 + }, + { + "epoch": 0.21742, + "grad_norm": 0.906755483117176, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 21742 + }, + { + "epoch": 0.21743, + "grad_norm": 0.7299678634687914, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 21743 + }, + { + "epoch": 0.21744, + "grad_norm": 0.7287868992837896, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 21744 + }, + { + "epoch": 0.21745, + "grad_norm": 0.8056391306239338, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 21745 + }, + { + "epoch": 0.21746, + "grad_norm": 0.8901162736036207, + "learning_rate": 0.003, + "loss": 4.044, + "step": 21746 + }, + { + "epoch": 0.21747, + "grad_norm": 0.8438633989529243, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 21747 + }, + { + "epoch": 0.21748, + "grad_norm": 0.7606066494856811, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 21748 + }, + { + "epoch": 0.21749, + "grad_norm": 0.6944044836174365, + "learning_rate": 0.003, + "loss": 4.024, + "step": 21749 + }, + { + "epoch": 0.2175, + "grad_norm": 0.6619833367728168, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 21750 + }, + { + "epoch": 0.21751, + "grad_norm": 0.7103281809588515, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 21751 + }, + { + "epoch": 0.21752, + "grad_norm": 0.7354971169745597, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 21752 + }, + { + "epoch": 0.21753, + "grad_norm": 0.833585795845446, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 21753 + }, + { + "epoch": 0.21754, + "grad_norm": 0.8894245683733248, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 21754 + }, + { + "epoch": 0.21755, + "grad_norm": 0.8612553490408327, + "learning_rate": 0.003, + "loss": 4.069, + "step": 21755 + }, + { + "epoch": 0.21756, + "grad_norm": 0.8247254338639682, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 21756 + }, + { + "epoch": 0.21757, + "grad_norm": 0.7995534097460469, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 21757 + }, + { + "epoch": 0.21758, + "grad_norm": 0.8442317205485379, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 21758 + }, + { + "epoch": 0.21759, + "grad_norm": 0.8615741170302611, + "learning_rate": 0.003, + "loss": 4.064, + "step": 21759 + }, + { + "epoch": 0.2176, + "grad_norm": 0.8879986320591802, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 21760 + }, + { + "epoch": 0.21761, + "grad_norm": 0.9912384350380697, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 21761 + }, + { + "epoch": 0.21762, + "grad_norm": 1.1614877312854015, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 21762 + }, + { + "epoch": 0.21763, + "grad_norm": 0.9920690760233691, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 21763 + }, + { + "epoch": 0.21764, + "grad_norm": 1.2023127995063825, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 21764 + }, + { + "epoch": 0.21765, + "grad_norm": 1.1743562689636649, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 21765 + }, + { + "epoch": 0.21766, + "grad_norm": 0.8184194833657082, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 21766 + }, + { + "epoch": 0.21767, + "grad_norm": 0.6096334570290339, + "learning_rate": 0.003, + "loss": 4.06, + "step": 21767 + }, + { + "epoch": 0.21768, + "grad_norm": 0.6160540298539695, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 21768 + }, + { + "epoch": 0.21769, + "grad_norm": 0.6500015352502315, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 21769 + }, + { + "epoch": 0.2177, + "grad_norm": 0.7285631809258732, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 21770 + }, + { + "epoch": 0.21771, + "grad_norm": 0.7609430266966889, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 21771 + }, + { + "epoch": 0.21772, + "grad_norm": 0.8070217227685988, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 21772 + }, + { + "epoch": 0.21773, + "grad_norm": 1.0632512826420057, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 21773 + }, + { + "epoch": 0.21774, + "grad_norm": 1.2353220250868409, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 21774 + }, + { + "epoch": 0.21775, + "grad_norm": 0.9261179251115704, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 21775 + }, + { + "epoch": 0.21776, + "grad_norm": 0.8639712711510896, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 21776 + }, + { + "epoch": 0.21777, + "grad_norm": 0.8128299898805978, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 21777 + }, + { + "epoch": 0.21778, + "grad_norm": 0.855685022439737, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 21778 + }, + { + "epoch": 0.21779, + "grad_norm": 0.8063805750924459, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 21779 + }, + { + "epoch": 0.2178, + "grad_norm": 0.8480521484072908, + "learning_rate": 0.003, + "loss": 4.06, + "step": 21780 + }, + { + "epoch": 0.21781, + "grad_norm": 0.9660245900318699, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 21781 + }, + { + "epoch": 0.21782, + "grad_norm": 1.0618385269148125, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 21782 + }, + { + "epoch": 0.21783, + "grad_norm": 1.0031395180549605, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 21783 + }, + { + "epoch": 0.21784, + "grad_norm": 0.9996017023202192, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 21784 + }, + { + "epoch": 0.21785, + "grad_norm": 1.0830149435163006, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 21785 + }, + { + "epoch": 0.21786, + "grad_norm": 0.8483612642835753, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 21786 + }, + { + "epoch": 0.21787, + "grad_norm": 0.7176720433588569, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 21787 + }, + { + "epoch": 0.21788, + "grad_norm": 0.76656946742843, + "learning_rate": 0.003, + "loss": 4.067, + "step": 21788 + }, + { + "epoch": 0.21789, + "grad_norm": 0.7348930326566943, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 21789 + }, + { + "epoch": 0.2179, + "grad_norm": 0.8178763061238735, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 21790 + }, + { + "epoch": 0.21791, + "grad_norm": 0.8432369152314106, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 21791 + }, + { + "epoch": 0.21792, + "grad_norm": 0.9160346440476382, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 21792 + }, + { + "epoch": 0.21793, + "grad_norm": 1.0035829820589213, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 21793 + }, + { + "epoch": 0.21794, + "grad_norm": 0.9941020274645616, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 21794 + }, + { + "epoch": 0.21795, + "grad_norm": 0.923841291635766, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 21795 + }, + { + "epoch": 0.21796, + "grad_norm": 1.0241090625911111, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 21796 + }, + { + "epoch": 0.21797, + "grad_norm": 1.1407388137491221, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 21797 + }, + { + "epoch": 0.21798, + "grad_norm": 0.8610989870612316, + "learning_rate": 0.003, + "loss": 4.058, + "step": 21798 + }, + { + "epoch": 0.21799, + "grad_norm": 0.8336347491408379, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 21799 + }, + { + "epoch": 0.218, + "grad_norm": 1.0339584357931486, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 21800 + }, + { + "epoch": 0.21801, + "grad_norm": 1.2544756120362408, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 21801 + }, + { + "epoch": 0.21802, + "grad_norm": 0.8172775587518837, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 21802 + }, + { + "epoch": 0.21803, + "grad_norm": 0.8331448364257094, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 21803 + }, + { + "epoch": 0.21804, + "grad_norm": 0.9073667972995151, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 21804 + }, + { + "epoch": 0.21805, + "grad_norm": 1.010912740947555, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 21805 + }, + { + "epoch": 0.21806, + "grad_norm": 1.0703870318792599, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 21806 + }, + { + "epoch": 0.21807, + "grad_norm": 0.9747586919888545, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 21807 + }, + { + "epoch": 0.21808, + "grad_norm": 0.8793745077518524, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 21808 + }, + { + "epoch": 0.21809, + "grad_norm": 0.8539302737976954, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 21809 + }, + { + "epoch": 0.2181, + "grad_norm": 0.8646277883648703, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 21810 + }, + { + "epoch": 0.21811, + "grad_norm": 0.8623922088132618, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 21811 + }, + { + "epoch": 0.21812, + "grad_norm": 0.9474532195929734, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 21812 + }, + { + "epoch": 0.21813, + "grad_norm": 0.9639535168663033, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 21813 + }, + { + "epoch": 0.21814, + "grad_norm": 0.9138906912409014, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 21814 + }, + { + "epoch": 0.21815, + "grad_norm": 1.119352753631545, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 21815 + }, + { + "epoch": 0.21816, + "grad_norm": 1.0997231232692022, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 21816 + }, + { + "epoch": 0.21817, + "grad_norm": 1.0971161967582554, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 21817 + }, + { + "epoch": 0.21818, + "grad_norm": 1.11049528237168, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 21818 + }, + { + "epoch": 0.21819, + "grad_norm": 0.8038519007683086, + "learning_rate": 0.003, + "loss": 4.057, + "step": 21819 + }, + { + "epoch": 0.2182, + "grad_norm": 0.8106520088920963, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 21820 + }, + { + "epoch": 0.21821, + "grad_norm": 0.8506136642300416, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 21821 + }, + { + "epoch": 0.21822, + "grad_norm": 0.9479386857753651, + "learning_rate": 0.003, + "loss": 4.026, + "step": 21822 + }, + { + "epoch": 0.21823, + "grad_norm": 0.9578053700772413, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 21823 + }, + { + "epoch": 0.21824, + "grad_norm": 0.868717582079508, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 21824 + }, + { + "epoch": 0.21825, + "grad_norm": 0.7912575729476037, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 21825 + }, + { + "epoch": 0.21826, + "grad_norm": 0.7311329522364901, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 21826 + }, + { + "epoch": 0.21827, + "grad_norm": 0.7262920795960275, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 21827 + }, + { + "epoch": 0.21828, + "grad_norm": 0.7514414802070677, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 21828 + }, + { + "epoch": 0.21829, + "grad_norm": 0.8752341715451432, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 21829 + }, + { + "epoch": 0.2183, + "grad_norm": 1.0182278948983072, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 21830 + }, + { + "epoch": 0.21831, + "grad_norm": 1.047232178453799, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 21831 + }, + { + "epoch": 0.21832, + "grad_norm": 1.142938001776131, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 21832 + }, + { + "epoch": 0.21833, + "grad_norm": 0.8663250237993643, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 21833 + }, + { + "epoch": 0.21834, + "grad_norm": 0.7484786077650789, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 21834 + }, + { + "epoch": 0.21835, + "grad_norm": 0.7674261991974324, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 21835 + }, + { + "epoch": 0.21836, + "grad_norm": 0.8879037233906544, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 21836 + }, + { + "epoch": 0.21837, + "grad_norm": 0.9446985187136123, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 21837 + }, + { + "epoch": 0.21838, + "grad_norm": 0.9384609787373933, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 21838 + }, + { + "epoch": 0.21839, + "grad_norm": 0.9770163781229995, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 21839 + }, + { + "epoch": 0.2184, + "grad_norm": 0.9148885808763234, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 21840 + }, + { + "epoch": 0.21841, + "grad_norm": 0.6592954238743538, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 21841 + }, + { + "epoch": 0.21842, + "grad_norm": 0.6663298622239738, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 21842 + }, + { + "epoch": 0.21843, + "grad_norm": 0.8771462388033658, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 21843 + }, + { + "epoch": 0.21844, + "grad_norm": 1.0706011980772319, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 21844 + }, + { + "epoch": 0.21845, + "grad_norm": 0.9354602263488214, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 21845 + }, + { + "epoch": 0.21846, + "grad_norm": 0.9344656532962532, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 21846 + }, + { + "epoch": 0.21847, + "grad_norm": 1.0298846182157835, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 21847 + }, + { + "epoch": 0.21848, + "grad_norm": 0.9446983450757662, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 21848 + }, + { + "epoch": 0.21849, + "grad_norm": 1.0034713206357542, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 21849 + }, + { + "epoch": 0.2185, + "grad_norm": 0.8707010208825356, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 21850 + }, + { + "epoch": 0.21851, + "grad_norm": 0.8716262999612676, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 21851 + }, + { + "epoch": 0.21852, + "grad_norm": 0.8397497111919408, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 21852 + }, + { + "epoch": 0.21853, + "grad_norm": 0.9372735724190617, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 21853 + }, + { + "epoch": 0.21854, + "grad_norm": 0.9295463004752186, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 21854 + }, + { + "epoch": 0.21855, + "grad_norm": 0.961755397950133, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 21855 + }, + { + "epoch": 0.21856, + "grad_norm": 1.015196190236595, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 21856 + }, + { + "epoch": 0.21857, + "grad_norm": 0.9869083536078628, + "learning_rate": 0.003, + "loss": 4.045, + "step": 21857 + }, + { + "epoch": 0.21858, + "grad_norm": 0.8937443116883966, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 21858 + }, + { + "epoch": 0.21859, + "grad_norm": 0.7909144463504884, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 21859 + }, + { + "epoch": 0.2186, + "grad_norm": 0.856811950215714, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 21860 + }, + { + "epoch": 0.21861, + "grad_norm": 0.9530697922161849, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 21861 + }, + { + "epoch": 0.21862, + "grad_norm": 1.042586908502439, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 21862 + }, + { + "epoch": 0.21863, + "grad_norm": 0.8531969681684066, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 21863 + }, + { + "epoch": 0.21864, + "grad_norm": 0.9731209413345449, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 21864 + }, + { + "epoch": 0.21865, + "grad_norm": 1.1988167292908487, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 21865 + }, + { + "epoch": 0.21866, + "grad_norm": 0.8389270945775307, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 21866 + }, + { + "epoch": 0.21867, + "grad_norm": 0.7455358541275876, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 21867 + }, + { + "epoch": 0.21868, + "grad_norm": 0.8170531807435585, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 21868 + }, + { + "epoch": 0.21869, + "grad_norm": 0.8094350063446712, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 21869 + }, + { + "epoch": 0.2187, + "grad_norm": 0.8664323643634094, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 21870 + }, + { + "epoch": 0.21871, + "grad_norm": 0.9823236756909407, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 21871 + }, + { + "epoch": 0.21872, + "grad_norm": 1.2424030025414625, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 21872 + }, + { + "epoch": 0.21873, + "grad_norm": 1.0055262272789798, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 21873 + }, + { + "epoch": 0.21874, + "grad_norm": 1.1366836626416323, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 21874 + }, + { + "epoch": 0.21875, + "grad_norm": 0.9011876189089401, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 21875 + }, + { + "epoch": 0.21876, + "grad_norm": 0.812194119712535, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 21876 + }, + { + "epoch": 0.21877, + "grad_norm": 0.8382600713152306, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 21877 + }, + { + "epoch": 0.21878, + "grad_norm": 0.7563550271046633, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 21878 + }, + { + "epoch": 0.21879, + "grad_norm": 0.6657831446827873, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 21879 + }, + { + "epoch": 0.2188, + "grad_norm": 0.7303940167997048, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 21880 + }, + { + "epoch": 0.21881, + "grad_norm": 0.7996671931555955, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 21881 + }, + { + "epoch": 0.21882, + "grad_norm": 0.9485744360751281, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 21882 + }, + { + "epoch": 0.21883, + "grad_norm": 1.0743253420193146, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 21883 + }, + { + "epoch": 0.21884, + "grad_norm": 1.121471550017392, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 21884 + }, + { + "epoch": 0.21885, + "grad_norm": 0.7206118261295444, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 21885 + }, + { + "epoch": 0.21886, + "grad_norm": 0.6813490256850663, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 21886 + }, + { + "epoch": 0.21887, + "grad_norm": 0.778450787755007, + "learning_rate": 0.003, + "loss": 4.03, + "step": 21887 + }, + { + "epoch": 0.21888, + "grad_norm": 0.8373205004683666, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 21888 + }, + { + "epoch": 0.21889, + "grad_norm": 0.8920749430057973, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 21889 + }, + { + "epoch": 0.2189, + "grad_norm": 0.8724784267186376, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 21890 + }, + { + "epoch": 0.21891, + "grad_norm": 0.7083332709884621, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 21891 + }, + { + "epoch": 0.21892, + "grad_norm": 0.647214062500226, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 21892 + }, + { + "epoch": 0.21893, + "grad_norm": 0.7844954984467796, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 21893 + }, + { + "epoch": 0.21894, + "grad_norm": 0.8793651469910239, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 21894 + }, + { + "epoch": 0.21895, + "grad_norm": 1.0256973846512143, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 21895 + }, + { + "epoch": 0.21896, + "grad_norm": 1.0649812626595685, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 21896 + }, + { + "epoch": 0.21897, + "grad_norm": 1.012867883109725, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 21897 + }, + { + "epoch": 0.21898, + "grad_norm": 0.954053886164319, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 21898 + }, + { + "epoch": 0.21899, + "grad_norm": 0.9521962465100751, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 21899 + }, + { + "epoch": 0.219, + "grad_norm": 0.9700752620442195, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 21900 + }, + { + "epoch": 0.21901, + "grad_norm": 0.8369189325197408, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 21901 + }, + { + "epoch": 0.21902, + "grad_norm": 0.6738877534835154, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 21902 + }, + { + "epoch": 0.21903, + "grad_norm": 0.6705390718182693, + "learning_rate": 0.003, + "loss": 4.049, + "step": 21903 + }, + { + "epoch": 0.21904, + "grad_norm": 0.7907706610014479, + "learning_rate": 0.003, + "loss": 4.036, + "step": 21904 + }, + { + "epoch": 0.21905, + "grad_norm": 0.9056948905677695, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 21905 + }, + { + "epoch": 0.21906, + "grad_norm": 1.075751867066345, + "learning_rate": 0.003, + "loss": 4.05, + "step": 21906 + }, + { + "epoch": 0.21907, + "grad_norm": 1.1494999490389362, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 21907 + }, + { + "epoch": 0.21908, + "grad_norm": 0.777028480084331, + "learning_rate": 0.003, + "loss": 4.054, + "step": 21908 + }, + { + "epoch": 0.21909, + "grad_norm": 0.6772436702913768, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 21909 + }, + { + "epoch": 0.2191, + "grad_norm": 0.6159040864506407, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 21910 + }, + { + "epoch": 0.21911, + "grad_norm": 0.5689982736070225, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 21911 + }, + { + "epoch": 0.21912, + "grad_norm": 0.6051979480078344, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 21912 + }, + { + "epoch": 0.21913, + "grad_norm": 0.6291469820395099, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 21913 + }, + { + "epoch": 0.21914, + "grad_norm": 0.7460128617886833, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 21914 + }, + { + "epoch": 0.21915, + "grad_norm": 0.9132170400530574, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 21915 + }, + { + "epoch": 0.21916, + "grad_norm": 1.0683635178805837, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 21916 + }, + { + "epoch": 0.21917, + "grad_norm": 0.8503349283094361, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 21917 + }, + { + "epoch": 0.21918, + "grad_norm": 0.7741921512919092, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 21918 + }, + { + "epoch": 0.21919, + "grad_norm": 0.9007887805894577, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 21919 + }, + { + "epoch": 0.2192, + "grad_norm": 1.1315815647411827, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 21920 + }, + { + "epoch": 0.21921, + "grad_norm": 0.900731775488954, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 21921 + }, + { + "epoch": 0.21922, + "grad_norm": 0.8402525578283325, + "learning_rate": 0.003, + "loss": 4.034, + "step": 21922 + }, + { + "epoch": 0.21923, + "grad_norm": 0.8020071535486587, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 21923 + }, + { + "epoch": 0.21924, + "grad_norm": 0.7662828007501628, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 21924 + }, + { + "epoch": 0.21925, + "grad_norm": 0.7963601304778837, + "learning_rate": 0.003, + "loss": 4.068, + "step": 21925 + }, + { + "epoch": 0.21926, + "grad_norm": 0.8035038686789826, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 21926 + }, + { + "epoch": 0.21927, + "grad_norm": 0.8361149920712347, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 21927 + }, + { + "epoch": 0.21928, + "grad_norm": 0.9546495384962267, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 21928 + }, + { + "epoch": 0.21929, + "grad_norm": 1.042465837267675, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 21929 + }, + { + "epoch": 0.2193, + "grad_norm": 0.9732944679204636, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 21930 + }, + { + "epoch": 0.21931, + "grad_norm": 1.2138280279745988, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 21931 + }, + { + "epoch": 0.21932, + "grad_norm": 0.8908117721549045, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 21932 + }, + { + "epoch": 0.21933, + "grad_norm": 0.8535564550124914, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 21933 + }, + { + "epoch": 0.21934, + "grad_norm": 0.9114577820581846, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 21934 + }, + { + "epoch": 0.21935, + "grad_norm": 0.9080055072066445, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 21935 + }, + { + "epoch": 0.21936, + "grad_norm": 1.0330105377112426, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 21936 + }, + { + "epoch": 0.21937, + "grad_norm": 0.9198481958564433, + "learning_rate": 0.003, + "loss": 4.038, + "step": 21937 + }, + { + "epoch": 0.21938, + "grad_norm": 1.0156252206314884, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 21938 + }, + { + "epoch": 0.21939, + "grad_norm": 1.2279596447696417, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 21939 + }, + { + "epoch": 0.2194, + "grad_norm": 0.9494450338245576, + "learning_rate": 0.003, + "loss": 4.064, + "step": 21940 + }, + { + "epoch": 0.21941, + "grad_norm": 0.8397723643997518, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 21941 + }, + { + "epoch": 0.21942, + "grad_norm": 0.8197689727260375, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 21942 + }, + { + "epoch": 0.21943, + "grad_norm": 0.8822972084098939, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 21943 + }, + { + "epoch": 0.21944, + "grad_norm": 0.913576643297579, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 21944 + }, + { + "epoch": 0.21945, + "grad_norm": 0.9681929262240375, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 21945 + }, + { + "epoch": 0.21946, + "grad_norm": 1.1276531714558127, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 21946 + }, + { + "epoch": 0.21947, + "grad_norm": 0.9563917469483959, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 21947 + }, + { + "epoch": 0.21948, + "grad_norm": 0.9297387134990617, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 21948 + }, + { + "epoch": 0.21949, + "grad_norm": 1.0030089167553753, + "learning_rate": 0.003, + "loss": 4.067, + "step": 21949 + }, + { + "epoch": 0.2195, + "grad_norm": 1.0260459078655029, + "learning_rate": 0.003, + "loss": 4.035, + "step": 21950 + }, + { + "epoch": 0.21951, + "grad_norm": 1.0261079793891301, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 21951 + }, + { + "epoch": 0.21952, + "grad_norm": 0.8150090489494513, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 21952 + }, + { + "epoch": 0.21953, + "grad_norm": 0.8330262705530509, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 21953 + }, + { + "epoch": 0.21954, + "grad_norm": 0.8877797175894346, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 21954 + }, + { + "epoch": 0.21955, + "grad_norm": 0.9163156116981159, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 21955 + }, + { + "epoch": 0.21956, + "grad_norm": 0.9117591523794207, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 21956 + }, + { + "epoch": 0.21957, + "grad_norm": 1.0061720500626825, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 21957 + }, + { + "epoch": 0.21958, + "grad_norm": 0.9336551832191011, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 21958 + }, + { + "epoch": 0.21959, + "grad_norm": 0.8638015974782018, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 21959 + }, + { + "epoch": 0.2196, + "grad_norm": 0.9258178951315998, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 21960 + }, + { + "epoch": 0.21961, + "grad_norm": 1.0095917318366774, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 21961 + }, + { + "epoch": 0.21962, + "grad_norm": 1.130325315875703, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 21962 + }, + { + "epoch": 0.21963, + "grad_norm": 0.9432677619169314, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 21963 + }, + { + "epoch": 0.21964, + "grad_norm": 0.9876202238780849, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 21964 + }, + { + "epoch": 0.21965, + "grad_norm": 0.9092715519423531, + "learning_rate": 0.003, + "loss": 4.039, + "step": 21965 + }, + { + "epoch": 0.21966, + "grad_norm": 0.7823542691538469, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 21966 + }, + { + "epoch": 0.21967, + "grad_norm": 0.7548664243000878, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 21967 + }, + { + "epoch": 0.21968, + "grad_norm": 0.8114571919490169, + "learning_rate": 0.003, + "loss": 4.04, + "step": 21968 + }, + { + "epoch": 0.21969, + "grad_norm": 0.8696570578924755, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 21969 + }, + { + "epoch": 0.2197, + "grad_norm": 1.0713546304019836, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 21970 + }, + { + "epoch": 0.21971, + "grad_norm": 1.0442277070403008, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 21971 + }, + { + "epoch": 0.21972, + "grad_norm": 1.0606164479372318, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 21972 + }, + { + "epoch": 0.21973, + "grad_norm": 0.9968035429856121, + "learning_rate": 0.003, + "loss": 4.07, + "step": 21973 + }, + { + "epoch": 0.21974, + "grad_norm": 0.9395825453993736, + "learning_rate": 0.003, + "loss": 4.093, + "step": 21974 + }, + { + "epoch": 0.21975, + "grad_norm": 0.8339717029443773, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 21975 + }, + { + "epoch": 0.21976, + "grad_norm": 0.7656181648361848, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 21976 + }, + { + "epoch": 0.21977, + "grad_norm": 0.7258709980315018, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 21977 + }, + { + "epoch": 0.21978, + "grad_norm": 0.682290123584367, + "learning_rate": 0.003, + "loss": 4.083, + "step": 21978 + }, + { + "epoch": 0.21979, + "grad_norm": 0.8346360893070389, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 21979 + }, + { + "epoch": 0.2198, + "grad_norm": 0.9460559704034633, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 21980 + }, + { + "epoch": 0.21981, + "grad_norm": 0.8432767121223144, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 21981 + }, + { + "epoch": 0.21982, + "grad_norm": 0.8901654019857779, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 21982 + }, + { + "epoch": 0.21983, + "grad_norm": 1.0870122988568107, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 21983 + }, + { + "epoch": 0.21984, + "grad_norm": 0.9472534506491445, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 21984 + }, + { + "epoch": 0.21985, + "grad_norm": 0.8072303785690024, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 21985 + }, + { + "epoch": 0.21986, + "grad_norm": 0.7499471652723284, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 21986 + }, + { + "epoch": 0.21987, + "grad_norm": 0.6842165837969477, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 21987 + }, + { + "epoch": 0.21988, + "grad_norm": 0.6925023766928035, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 21988 + }, + { + "epoch": 0.21989, + "grad_norm": 0.8514035080316976, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 21989 + }, + { + "epoch": 0.2199, + "grad_norm": 0.9955773677075328, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 21990 + }, + { + "epoch": 0.21991, + "grad_norm": 1.0341450259368192, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 21991 + }, + { + "epoch": 0.21992, + "grad_norm": 0.9267237665318736, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 21992 + }, + { + "epoch": 0.21993, + "grad_norm": 0.9514563001539492, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 21993 + }, + { + "epoch": 0.21994, + "grad_norm": 1.0475667411585288, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 21994 + }, + { + "epoch": 0.21995, + "grad_norm": 0.9340612817076597, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 21995 + }, + { + "epoch": 0.21996, + "grad_norm": 0.9053888369308316, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 21996 + }, + { + "epoch": 0.21997, + "grad_norm": 0.9854427296908433, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 21997 + }, + { + "epoch": 0.21998, + "grad_norm": 1.1047818368556792, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 21998 + }, + { + "epoch": 0.21999, + "grad_norm": 0.8928894110392157, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 21999 + }, + { + "epoch": 0.22, + "grad_norm": 0.7950769883951311, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 22000 + }, + { + "epoch": 0.22001, + "grad_norm": 0.734494180822481, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 22001 + }, + { + "epoch": 0.22002, + "grad_norm": 0.6876677544192328, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 22002 + }, + { + "epoch": 0.22003, + "grad_norm": 0.6834251727318637, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 22003 + }, + { + "epoch": 0.22004, + "grad_norm": 0.621448935106141, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 22004 + }, + { + "epoch": 0.22005, + "grad_norm": 0.620067830057128, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 22005 + }, + { + "epoch": 0.22006, + "grad_norm": 0.6880286765366899, + "learning_rate": 0.003, + "loss": 4.047, + "step": 22006 + }, + { + "epoch": 0.22007, + "grad_norm": 0.6762293899215118, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 22007 + }, + { + "epoch": 0.22008, + "grad_norm": 0.6906504675381372, + "learning_rate": 0.003, + "loss": 4.027, + "step": 22008 + }, + { + "epoch": 0.22009, + "grad_norm": 0.8750565593766731, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 22009 + }, + { + "epoch": 0.2201, + "grad_norm": 1.1111229562303055, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 22010 + }, + { + "epoch": 0.22011, + "grad_norm": 0.8122356668206744, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 22011 + }, + { + "epoch": 0.22012, + "grad_norm": 0.6421844897216124, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 22012 + }, + { + "epoch": 0.22013, + "grad_norm": 0.6393545101726368, + "learning_rate": 0.003, + "loss": 4.021, + "step": 22013 + }, + { + "epoch": 0.22014, + "grad_norm": 0.698672663466228, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 22014 + }, + { + "epoch": 0.22015, + "grad_norm": 0.8117114166218956, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 22015 + }, + { + "epoch": 0.22016, + "grad_norm": 1.0789685871338646, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 22016 + }, + { + "epoch": 0.22017, + "grad_norm": 1.1635471219847546, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 22017 + }, + { + "epoch": 0.22018, + "grad_norm": 0.8894181980682901, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 22018 + }, + { + "epoch": 0.22019, + "grad_norm": 0.7599329229872507, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 22019 + }, + { + "epoch": 0.2202, + "grad_norm": 0.8138520555366555, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 22020 + }, + { + "epoch": 0.22021, + "grad_norm": 0.9413706647623215, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 22021 + }, + { + "epoch": 0.22022, + "grad_norm": 0.9956293965437711, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 22022 + }, + { + "epoch": 0.22023, + "grad_norm": 0.872226871472913, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 22023 + }, + { + "epoch": 0.22024, + "grad_norm": 0.8392197383485834, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 22024 + }, + { + "epoch": 0.22025, + "grad_norm": 0.821763293132257, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 22025 + }, + { + "epoch": 0.22026, + "grad_norm": 0.9739707963064859, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 22026 + }, + { + "epoch": 0.22027, + "grad_norm": 1.3005476925099593, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 22027 + }, + { + "epoch": 0.22028, + "grad_norm": 0.8544054111562639, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 22028 + }, + { + "epoch": 0.22029, + "grad_norm": 0.8524055747795404, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 22029 + }, + { + "epoch": 0.2203, + "grad_norm": 0.8159263780645369, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 22030 + }, + { + "epoch": 0.22031, + "grad_norm": 0.8885496352845809, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 22031 + }, + { + "epoch": 0.22032, + "grad_norm": 1.019125991831821, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 22032 + }, + { + "epoch": 0.22033, + "grad_norm": 1.0433179950163909, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 22033 + }, + { + "epoch": 0.22034, + "grad_norm": 1.1013315508093695, + "learning_rate": 0.003, + "loss": 4.066, + "step": 22034 + }, + { + "epoch": 0.22035, + "grad_norm": 1.217801173080061, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 22035 + }, + { + "epoch": 0.22036, + "grad_norm": 0.9719025062139389, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 22036 + }, + { + "epoch": 0.22037, + "grad_norm": 0.9460145715715301, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 22037 + }, + { + "epoch": 0.22038, + "grad_norm": 0.9553322724925778, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 22038 + }, + { + "epoch": 0.22039, + "grad_norm": 0.8793063614064192, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 22039 + }, + { + "epoch": 0.2204, + "grad_norm": 0.7581261730545026, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 22040 + }, + { + "epoch": 0.22041, + "grad_norm": 0.7483301073810726, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 22041 + }, + { + "epoch": 0.22042, + "grad_norm": 0.8856112184457362, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 22042 + }, + { + "epoch": 0.22043, + "grad_norm": 1.2593560711629388, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 22043 + }, + { + "epoch": 0.22044, + "grad_norm": 0.8008771904157289, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 22044 + }, + { + "epoch": 0.22045, + "grad_norm": 0.7384000284456744, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 22045 + }, + { + "epoch": 0.22046, + "grad_norm": 0.7890036037153679, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 22046 + }, + { + "epoch": 0.22047, + "grad_norm": 0.9421431918982789, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 22047 + }, + { + "epoch": 0.22048, + "grad_norm": 1.025099617930658, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 22048 + }, + { + "epoch": 0.22049, + "grad_norm": 1.1616626707805855, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 22049 + }, + { + "epoch": 0.2205, + "grad_norm": 0.7228352275682283, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 22050 + }, + { + "epoch": 0.22051, + "grad_norm": 0.711001669386256, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 22051 + }, + { + "epoch": 0.22052, + "grad_norm": 0.7176328414632288, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 22052 + }, + { + "epoch": 0.22053, + "grad_norm": 0.59507105959698, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 22053 + }, + { + "epoch": 0.22054, + "grad_norm": 0.6946647283928828, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 22054 + }, + { + "epoch": 0.22055, + "grad_norm": 0.7301014932964457, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 22055 + }, + { + "epoch": 0.22056, + "grad_norm": 0.8459781177901133, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 22056 + }, + { + "epoch": 0.22057, + "grad_norm": 1.0307913515441087, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 22057 + }, + { + "epoch": 0.22058, + "grad_norm": 1.1661211277669492, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 22058 + }, + { + "epoch": 0.22059, + "grad_norm": 0.8209930959787649, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 22059 + }, + { + "epoch": 0.2206, + "grad_norm": 1.0499763903556976, + "learning_rate": 0.003, + "loss": 4.073, + "step": 22060 + }, + { + "epoch": 0.22061, + "grad_norm": 1.323809242615842, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 22061 + }, + { + "epoch": 0.22062, + "grad_norm": 0.7677855366310005, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 22062 + }, + { + "epoch": 0.22063, + "grad_norm": 0.6958455279405271, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 22063 + }, + { + "epoch": 0.22064, + "grad_norm": 0.7783873526395044, + "learning_rate": 0.003, + "loss": 4.067, + "step": 22064 + }, + { + "epoch": 0.22065, + "grad_norm": 0.7922877051007191, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 22065 + }, + { + "epoch": 0.22066, + "grad_norm": 0.8265533332452641, + "learning_rate": 0.003, + "loss": 4.024, + "step": 22066 + }, + { + "epoch": 0.22067, + "grad_norm": 0.8544521811434357, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 22067 + }, + { + "epoch": 0.22068, + "grad_norm": 0.7763784129242721, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 22068 + }, + { + "epoch": 0.22069, + "grad_norm": 0.7567372744531591, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 22069 + }, + { + "epoch": 0.2207, + "grad_norm": 0.941578178646652, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 22070 + }, + { + "epoch": 0.22071, + "grad_norm": 1.3653916734084433, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 22071 + }, + { + "epoch": 0.22072, + "grad_norm": 0.6764121067076573, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 22072 + }, + { + "epoch": 0.22073, + "grad_norm": 0.6822005871998648, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 22073 + }, + { + "epoch": 0.22074, + "grad_norm": 0.7734727632237398, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 22074 + }, + { + "epoch": 0.22075, + "grad_norm": 0.8263022034619842, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 22075 + }, + { + "epoch": 0.22076, + "grad_norm": 0.8555431548382049, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 22076 + }, + { + "epoch": 0.22077, + "grad_norm": 0.9943960835641048, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 22077 + }, + { + "epoch": 0.22078, + "grad_norm": 1.3209523536699566, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 22078 + }, + { + "epoch": 0.22079, + "grad_norm": 0.8628817290893656, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 22079 + }, + { + "epoch": 0.2208, + "grad_norm": 0.8026017998921515, + "learning_rate": 0.003, + "loss": 4.058, + "step": 22080 + }, + { + "epoch": 0.22081, + "grad_norm": 0.8013839472692773, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 22081 + }, + { + "epoch": 0.22082, + "grad_norm": 0.8016469163570976, + "learning_rate": 0.003, + "loss": 4.062, + "step": 22082 + }, + { + "epoch": 0.22083, + "grad_norm": 0.7982184060808496, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 22083 + }, + { + "epoch": 0.22084, + "grad_norm": 0.9097833295102461, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 22084 + }, + { + "epoch": 0.22085, + "grad_norm": 0.9517237446547862, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 22085 + }, + { + "epoch": 0.22086, + "grad_norm": 1.0815212225852908, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 22086 + }, + { + "epoch": 0.22087, + "grad_norm": 1.0950994509268546, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 22087 + }, + { + "epoch": 0.22088, + "grad_norm": 1.152165618701013, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 22088 + }, + { + "epoch": 0.22089, + "grad_norm": 0.9979512055191654, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 22089 + }, + { + "epoch": 0.2209, + "grad_norm": 1.0725087493276717, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 22090 + }, + { + "epoch": 0.22091, + "grad_norm": 0.9584336254451938, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 22091 + }, + { + "epoch": 0.22092, + "grad_norm": 0.8307520421776294, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 22092 + }, + { + "epoch": 0.22093, + "grad_norm": 0.7027488547501028, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 22093 + }, + { + "epoch": 0.22094, + "grad_norm": 0.6915087178386995, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 22094 + }, + { + "epoch": 0.22095, + "grad_norm": 0.6957966279279953, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 22095 + }, + { + "epoch": 0.22096, + "grad_norm": 0.6528400653651122, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 22096 + }, + { + "epoch": 0.22097, + "grad_norm": 0.6201580723620347, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 22097 + }, + { + "epoch": 0.22098, + "grad_norm": 0.648348209377725, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 22098 + }, + { + "epoch": 0.22099, + "grad_norm": 0.6883313940083414, + "learning_rate": 0.003, + "loss": 4.023, + "step": 22099 + }, + { + "epoch": 0.221, + "grad_norm": 0.7528538106866004, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 22100 + }, + { + "epoch": 0.22101, + "grad_norm": 1.0352249303340857, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 22101 + }, + { + "epoch": 0.22102, + "grad_norm": 1.2794084290109426, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 22102 + }, + { + "epoch": 0.22103, + "grad_norm": 0.8448328340651118, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 22103 + }, + { + "epoch": 0.22104, + "grad_norm": 0.8392210845201308, + "learning_rate": 0.003, + "loss": 4.074, + "step": 22104 + }, + { + "epoch": 0.22105, + "grad_norm": 0.9451307645110609, + "learning_rate": 0.003, + "loss": 4.048, + "step": 22105 + }, + { + "epoch": 0.22106, + "grad_norm": 1.2340804605629108, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 22106 + }, + { + "epoch": 0.22107, + "grad_norm": 0.9491584976803158, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 22107 + }, + { + "epoch": 0.22108, + "grad_norm": 0.8646855903049168, + "learning_rate": 0.003, + "loss": 4.032, + "step": 22108 + }, + { + "epoch": 0.22109, + "grad_norm": 0.875144157854956, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 22109 + }, + { + "epoch": 0.2211, + "grad_norm": 1.0337424841785552, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 22110 + }, + { + "epoch": 0.22111, + "grad_norm": 1.1641434240189423, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 22111 + }, + { + "epoch": 0.22112, + "grad_norm": 0.7755006738748029, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 22112 + }, + { + "epoch": 0.22113, + "grad_norm": 0.8580274284258331, + "learning_rate": 0.003, + "loss": 4.067, + "step": 22113 + }, + { + "epoch": 0.22114, + "grad_norm": 0.815040159252193, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 22114 + }, + { + "epoch": 0.22115, + "grad_norm": 0.8608390652691527, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 22115 + }, + { + "epoch": 0.22116, + "grad_norm": 0.8741338655947701, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 22116 + }, + { + "epoch": 0.22117, + "grad_norm": 1.0332256061894953, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 22117 + }, + { + "epoch": 0.22118, + "grad_norm": 0.9913553858784968, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 22118 + }, + { + "epoch": 0.22119, + "grad_norm": 0.8952093848739033, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 22119 + }, + { + "epoch": 0.2212, + "grad_norm": 0.8323849163935308, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 22120 + }, + { + "epoch": 0.22121, + "grad_norm": 0.8896918263188238, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 22121 + }, + { + "epoch": 0.22122, + "grad_norm": 0.8380957319175341, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 22122 + }, + { + "epoch": 0.22123, + "grad_norm": 0.8977118166031717, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 22123 + }, + { + "epoch": 0.22124, + "grad_norm": 0.929525155158042, + "learning_rate": 0.003, + "loss": 4.049, + "step": 22124 + }, + { + "epoch": 0.22125, + "grad_norm": 0.858834806480359, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 22125 + }, + { + "epoch": 0.22126, + "grad_norm": 1.126327112589891, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 22126 + }, + { + "epoch": 0.22127, + "grad_norm": 1.053762375541193, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 22127 + }, + { + "epoch": 0.22128, + "grad_norm": 0.9320491937708714, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 22128 + }, + { + "epoch": 0.22129, + "grad_norm": 0.9231865776373712, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 22129 + }, + { + "epoch": 0.2213, + "grad_norm": 0.9667146918097497, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 22130 + }, + { + "epoch": 0.22131, + "grad_norm": 0.9729518931456379, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 22131 + }, + { + "epoch": 0.22132, + "grad_norm": 0.9121518921525817, + "learning_rate": 0.003, + "loss": 4.057, + "step": 22132 + }, + { + "epoch": 0.22133, + "grad_norm": 0.8370225880789848, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 22133 + }, + { + "epoch": 0.22134, + "grad_norm": 0.8354183493663939, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 22134 + }, + { + "epoch": 0.22135, + "grad_norm": 1.0130541711460181, + "learning_rate": 0.003, + "loss": 4.037, + "step": 22135 + }, + { + "epoch": 0.22136, + "grad_norm": 1.040596110292542, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 22136 + }, + { + "epoch": 0.22137, + "grad_norm": 1.0605796720384841, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 22137 + }, + { + "epoch": 0.22138, + "grad_norm": 0.9628032304889746, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 22138 + }, + { + "epoch": 0.22139, + "grad_norm": 0.9665042275463452, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 22139 + }, + { + "epoch": 0.2214, + "grad_norm": 1.1181353071397961, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 22140 + }, + { + "epoch": 0.22141, + "grad_norm": 1.0201701284349813, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 22141 + }, + { + "epoch": 0.22142, + "grad_norm": 1.042802377375285, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 22142 + }, + { + "epoch": 0.22143, + "grad_norm": 0.9934459441553016, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 22143 + }, + { + "epoch": 0.22144, + "grad_norm": 0.9959806335311557, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 22144 + }, + { + "epoch": 0.22145, + "grad_norm": 1.031315476389047, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 22145 + }, + { + "epoch": 0.22146, + "grad_norm": 0.8705023893606555, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 22146 + }, + { + "epoch": 0.22147, + "grad_norm": 0.9043040416893877, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 22147 + }, + { + "epoch": 0.22148, + "grad_norm": 1.0455612389309925, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 22148 + }, + { + "epoch": 0.22149, + "grad_norm": 0.9991875187307905, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 22149 + }, + { + "epoch": 0.2215, + "grad_norm": 1.1248142813176287, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 22150 + }, + { + "epoch": 0.22151, + "grad_norm": 0.997354313488082, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 22151 + }, + { + "epoch": 0.22152, + "grad_norm": 0.8840634081359512, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 22152 + }, + { + "epoch": 0.22153, + "grad_norm": 0.8126178788523708, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 22153 + }, + { + "epoch": 0.22154, + "grad_norm": 0.9061281939783663, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 22154 + }, + { + "epoch": 0.22155, + "grad_norm": 0.8304275294748221, + "learning_rate": 0.003, + "loss": 4.055, + "step": 22155 + }, + { + "epoch": 0.22156, + "grad_norm": 0.7216573910767039, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 22156 + }, + { + "epoch": 0.22157, + "grad_norm": 0.7816462253050105, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 22157 + }, + { + "epoch": 0.22158, + "grad_norm": 0.7675083120895178, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 22158 + }, + { + "epoch": 0.22159, + "grad_norm": 0.7596831584191603, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 22159 + }, + { + "epoch": 0.2216, + "grad_norm": 0.7975989080395691, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 22160 + }, + { + "epoch": 0.22161, + "grad_norm": 0.8317596997184951, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 22161 + }, + { + "epoch": 0.22162, + "grad_norm": 0.7711297528376383, + "learning_rate": 0.003, + "loss": 4.061, + "step": 22162 + }, + { + "epoch": 0.22163, + "grad_norm": 0.740565479950469, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 22163 + }, + { + "epoch": 0.22164, + "grad_norm": 0.7113503872288879, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 22164 + }, + { + "epoch": 0.22165, + "grad_norm": 0.7925254195358435, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 22165 + }, + { + "epoch": 0.22166, + "grad_norm": 1.0010967797337047, + "learning_rate": 0.003, + "loss": 4.039, + "step": 22166 + }, + { + "epoch": 0.22167, + "grad_norm": 1.3451264683620023, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 22167 + }, + { + "epoch": 0.22168, + "grad_norm": 0.6507696929249808, + "learning_rate": 0.003, + "loss": 4.032, + "step": 22168 + }, + { + "epoch": 0.22169, + "grad_norm": 0.5968651095924886, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 22169 + }, + { + "epoch": 0.2217, + "grad_norm": 0.6557308511079163, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 22170 + }, + { + "epoch": 0.22171, + "grad_norm": 0.7078889207658147, + "learning_rate": 0.003, + "loss": 4.068, + "step": 22171 + }, + { + "epoch": 0.22172, + "grad_norm": 0.7554749082054675, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 22172 + }, + { + "epoch": 0.22173, + "grad_norm": 0.9010140617183018, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 22173 + }, + { + "epoch": 0.22174, + "grad_norm": 1.0533805732761516, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 22174 + }, + { + "epoch": 0.22175, + "grad_norm": 1.0787792443321154, + "learning_rate": 0.003, + "loss": 4.042, + "step": 22175 + }, + { + "epoch": 0.22176, + "grad_norm": 0.8603706689604045, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 22176 + }, + { + "epoch": 0.22177, + "grad_norm": 0.8365128173918001, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 22177 + }, + { + "epoch": 0.22178, + "grad_norm": 0.9061165104715343, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 22178 + }, + { + "epoch": 0.22179, + "grad_norm": 1.0330629903860584, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 22179 + }, + { + "epoch": 0.2218, + "grad_norm": 1.0623081407534747, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 22180 + }, + { + "epoch": 0.22181, + "grad_norm": 0.9444235902192694, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 22181 + }, + { + "epoch": 0.22182, + "grad_norm": 0.9949618419516716, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 22182 + }, + { + "epoch": 0.22183, + "grad_norm": 0.9022154973254922, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 22183 + }, + { + "epoch": 0.22184, + "grad_norm": 0.8552683888003327, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 22184 + }, + { + "epoch": 0.22185, + "grad_norm": 0.7865121440982391, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 22185 + }, + { + "epoch": 0.22186, + "grad_norm": 0.6725445992678838, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 22186 + }, + { + "epoch": 0.22187, + "grad_norm": 0.6246129312732067, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 22187 + }, + { + "epoch": 0.22188, + "grad_norm": 0.5910546718489529, + "learning_rate": 0.003, + "loss": 4.079, + "step": 22188 + }, + { + "epoch": 0.22189, + "grad_norm": 0.7346404514035609, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 22189 + }, + { + "epoch": 0.2219, + "grad_norm": 0.8032220144636948, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 22190 + }, + { + "epoch": 0.22191, + "grad_norm": 0.9614117698813585, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 22191 + }, + { + "epoch": 0.22192, + "grad_norm": 1.225521655975889, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 22192 + }, + { + "epoch": 0.22193, + "grad_norm": 0.9097348342293494, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 22193 + }, + { + "epoch": 0.22194, + "grad_norm": 0.8712610714269252, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 22194 + }, + { + "epoch": 0.22195, + "grad_norm": 0.8691822225852284, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 22195 + }, + { + "epoch": 0.22196, + "grad_norm": 0.8836588709202576, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 22196 + }, + { + "epoch": 0.22197, + "grad_norm": 0.8829506610471086, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 22197 + }, + { + "epoch": 0.22198, + "grad_norm": 0.7561080502813564, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 22198 + }, + { + "epoch": 0.22199, + "grad_norm": 0.7797588713555024, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 22199 + }, + { + "epoch": 0.222, + "grad_norm": 0.8651115826878089, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 22200 + }, + { + "epoch": 0.22201, + "grad_norm": 1.126046976718936, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 22201 + }, + { + "epoch": 0.22202, + "grad_norm": 0.8855702016736842, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 22202 + }, + { + "epoch": 0.22203, + "grad_norm": 0.7567553761426342, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 22203 + }, + { + "epoch": 0.22204, + "grad_norm": 0.8060115105780602, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 22204 + }, + { + "epoch": 0.22205, + "grad_norm": 0.8958884789482368, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 22205 + }, + { + "epoch": 0.22206, + "grad_norm": 1.2045798161386938, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 22206 + }, + { + "epoch": 0.22207, + "grad_norm": 1.007792019711113, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 22207 + }, + { + "epoch": 0.22208, + "grad_norm": 1.040523550968073, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 22208 + }, + { + "epoch": 0.22209, + "grad_norm": 1.2164659225319387, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 22209 + }, + { + "epoch": 0.2221, + "grad_norm": 0.8150735851848918, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 22210 + }, + { + "epoch": 0.22211, + "grad_norm": 0.8732099062632754, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 22211 + }, + { + "epoch": 0.22212, + "grad_norm": 1.0572751149780963, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 22212 + }, + { + "epoch": 0.22213, + "grad_norm": 0.9445644027522035, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 22213 + }, + { + "epoch": 0.22214, + "grad_norm": 0.9461109439482839, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 22214 + }, + { + "epoch": 0.22215, + "grad_norm": 0.9757432408925696, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 22215 + }, + { + "epoch": 0.22216, + "grad_norm": 1.0913221516178517, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 22216 + }, + { + "epoch": 0.22217, + "grad_norm": 0.9322208231367511, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 22217 + }, + { + "epoch": 0.22218, + "grad_norm": 0.960267014589131, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 22218 + }, + { + "epoch": 0.22219, + "grad_norm": 0.9851270684249974, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 22219 + }, + { + "epoch": 0.2222, + "grad_norm": 0.8968376529284131, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 22220 + }, + { + "epoch": 0.22221, + "grad_norm": 0.8541058489244915, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 22221 + }, + { + "epoch": 0.22222, + "grad_norm": 0.7404623465811235, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 22222 + }, + { + "epoch": 0.22223, + "grad_norm": 0.6966837123317066, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 22223 + }, + { + "epoch": 0.22224, + "grad_norm": 0.774488663185445, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 22224 + }, + { + "epoch": 0.22225, + "grad_norm": 0.811676375788709, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 22225 + }, + { + "epoch": 0.22226, + "grad_norm": 0.8266184812123376, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 22226 + }, + { + "epoch": 0.22227, + "grad_norm": 0.7848063018162497, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 22227 + }, + { + "epoch": 0.22228, + "grad_norm": 0.7417912641251252, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 22228 + }, + { + "epoch": 0.22229, + "grad_norm": 0.7621330356709436, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 22229 + }, + { + "epoch": 0.2223, + "grad_norm": 0.7374006294003671, + "learning_rate": 0.003, + "loss": 4.031, + "step": 22230 + }, + { + "epoch": 0.22231, + "grad_norm": 0.7016098707997227, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 22231 + }, + { + "epoch": 0.22232, + "grad_norm": 0.7360385235382604, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 22232 + }, + { + "epoch": 0.22233, + "grad_norm": 0.8548974955039184, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 22233 + }, + { + "epoch": 0.22234, + "grad_norm": 1.0789988701948223, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 22234 + }, + { + "epoch": 0.22235, + "grad_norm": 1.1183440907226048, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 22235 + }, + { + "epoch": 0.22236, + "grad_norm": 0.9207834979644175, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 22236 + }, + { + "epoch": 0.22237, + "grad_norm": 0.970248701968993, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 22237 + }, + { + "epoch": 0.22238, + "grad_norm": 0.9743513122486656, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 22238 + }, + { + "epoch": 0.22239, + "grad_norm": 0.9505675812880745, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 22239 + }, + { + "epoch": 0.2224, + "grad_norm": 0.9109406895094778, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 22240 + }, + { + "epoch": 0.22241, + "grad_norm": 0.7591292533552716, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 22241 + }, + { + "epoch": 0.22242, + "grad_norm": 0.7535812438133108, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 22242 + }, + { + "epoch": 0.22243, + "grad_norm": 0.7913443728375273, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 22243 + }, + { + "epoch": 0.22244, + "grad_norm": 0.816978680641882, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 22244 + }, + { + "epoch": 0.22245, + "grad_norm": 0.8707553278244405, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 22245 + }, + { + "epoch": 0.22246, + "grad_norm": 0.9363695816754737, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 22246 + }, + { + "epoch": 0.22247, + "grad_norm": 1.106319931353203, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 22247 + }, + { + "epoch": 0.22248, + "grad_norm": 0.979389331760488, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 22248 + }, + { + "epoch": 0.22249, + "grad_norm": 1.0567714569546038, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 22249 + }, + { + "epoch": 0.2225, + "grad_norm": 1.025156301507456, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 22250 + }, + { + "epoch": 0.22251, + "grad_norm": 0.7958547830790944, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 22251 + }, + { + "epoch": 0.22252, + "grad_norm": 0.7616686415494771, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 22252 + }, + { + "epoch": 0.22253, + "grad_norm": 0.7836887868195285, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 22253 + }, + { + "epoch": 0.22254, + "grad_norm": 0.9174334860499777, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 22254 + }, + { + "epoch": 0.22255, + "grad_norm": 1.080705316356119, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 22255 + }, + { + "epoch": 0.22256, + "grad_norm": 1.02214815117429, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 22256 + }, + { + "epoch": 0.22257, + "grad_norm": 1.075300529619242, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 22257 + }, + { + "epoch": 0.22258, + "grad_norm": 0.9924569549639446, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 22258 + }, + { + "epoch": 0.22259, + "grad_norm": 0.9005378170256678, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 22259 + }, + { + "epoch": 0.2226, + "grad_norm": 0.7691204569824857, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 22260 + }, + { + "epoch": 0.22261, + "grad_norm": 0.7933742659065883, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 22261 + }, + { + "epoch": 0.22262, + "grad_norm": 1.0122217057773604, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 22262 + }, + { + "epoch": 0.22263, + "grad_norm": 0.980441500461147, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 22263 + }, + { + "epoch": 0.22264, + "grad_norm": 1.166041235904597, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 22264 + }, + { + "epoch": 0.22265, + "grad_norm": 0.9965704825137346, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 22265 + }, + { + "epoch": 0.22266, + "grad_norm": 1.092098565087222, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 22266 + }, + { + "epoch": 0.22267, + "grad_norm": 0.9158891393607803, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 22267 + }, + { + "epoch": 0.22268, + "grad_norm": 0.9339800562943872, + "learning_rate": 0.003, + "loss": 4.043, + "step": 22268 + }, + { + "epoch": 0.22269, + "grad_norm": 0.9070806760180086, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 22269 + }, + { + "epoch": 0.2227, + "grad_norm": 0.8135202578134365, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 22270 + }, + { + "epoch": 0.22271, + "grad_norm": 0.9029984110274643, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 22271 + }, + { + "epoch": 0.22272, + "grad_norm": 0.9702777636583725, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 22272 + }, + { + "epoch": 0.22273, + "grad_norm": 1.0855614221788181, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 22273 + }, + { + "epoch": 0.22274, + "grad_norm": 1.0976454744023618, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 22274 + }, + { + "epoch": 0.22275, + "grad_norm": 0.9330981263535711, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 22275 + }, + { + "epoch": 0.22276, + "grad_norm": 0.9970874853493966, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 22276 + }, + { + "epoch": 0.22277, + "grad_norm": 1.0498423082472537, + "learning_rate": 0.003, + "loss": 4.07, + "step": 22277 + }, + { + "epoch": 0.22278, + "grad_norm": 0.8679956379317704, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 22278 + }, + { + "epoch": 0.22279, + "grad_norm": 0.7125520883229424, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 22279 + }, + { + "epoch": 0.2228, + "grad_norm": 0.5870127924238849, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 22280 + }, + { + "epoch": 0.22281, + "grad_norm": 0.6582477572645474, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 22281 + }, + { + "epoch": 0.22282, + "grad_norm": 0.7250964131010654, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 22282 + }, + { + "epoch": 0.22283, + "grad_norm": 0.7702424253612006, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 22283 + }, + { + "epoch": 0.22284, + "grad_norm": 0.7128621458385906, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 22284 + }, + { + "epoch": 0.22285, + "grad_norm": 0.6641360527932991, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 22285 + }, + { + "epoch": 0.22286, + "grad_norm": 0.707612114099158, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 22286 + }, + { + "epoch": 0.22287, + "grad_norm": 0.7913411948093053, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 22287 + }, + { + "epoch": 0.22288, + "grad_norm": 0.8407537381088891, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 22288 + }, + { + "epoch": 0.22289, + "grad_norm": 0.800169299113007, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 22289 + }, + { + "epoch": 0.2229, + "grad_norm": 0.7528566573198635, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 22290 + }, + { + "epoch": 0.22291, + "grad_norm": 0.7436393659146529, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 22291 + }, + { + "epoch": 0.22292, + "grad_norm": 0.7014270350756187, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 22292 + }, + { + "epoch": 0.22293, + "grad_norm": 0.926045097420717, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 22293 + }, + { + "epoch": 0.22294, + "grad_norm": 1.252627835692378, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 22294 + }, + { + "epoch": 0.22295, + "grad_norm": 0.8996018601282918, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 22295 + }, + { + "epoch": 0.22296, + "grad_norm": 0.8787044778209848, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 22296 + }, + { + "epoch": 0.22297, + "grad_norm": 0.9833706736631922, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 22297 + }, + { + "epoch": 0.22298, + "grad_norm": 1.165494918084471, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 22298 + }, + { + "epoch": 0.22299, + "grad_norm": 0.9329827524970048, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 22299 + }, + { + "epoch": 0.223, + "grad_norm": 0.8801949822446797, + "learning_rate": 0.003, + "loss": 4.067, + "step": 22300 + }, + { + "epoch": 0.22301, + "grad_norm": 0.9553959724873491, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 22301 + }, + { + "epoch": 0.22302, + "grad_norm": 0.9064785776113735, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 22302 + }, + { + "epoch": 0.22303, + "grad_norm": 1.0508090619487611, + "learning_rate": 0.003, + "loss": 3.9996, + "step": 22303 + }, + { + "epoch": 0.22304, + "grad_norm": 1.120638968298277, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 22304 + }, + { + "epoch": 0.22305, + "grad_norm": 0.9500244501742219, + "learning_rate": 0.003, + "loss": 4.048, + "step": 22305 + }, + { + "epoch": 0.22306, + "grad_norm": 0.9535745268901713, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 22306 + }, + { + "epoch": 0.22307, + "grad_norm": 1.0857787682830338, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 22307 + }, + { + "epoch": 0.22308, + "grad_norm": 0.9679996787972609, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 22308 + }, + { + "epoch": 0.22309, + "grad_norm": 1.186965179024597, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 22309 + }, + { + "epoch": 0.2231, + "grad_norm": 0.8625048381922663, + "learning_rate": 0.003, + "loss": 4.06, + "step": 22310 + }, + { + "epoch": 0.22311, + "grad_norm": 1.0154775627960273, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 22311 + }, + { + "epoch": 0.22312, + "grad_norm": 0.960274294556906, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 22312 + }, + { + "epoch": 0.22313, + "grad_norm": 0.7665702901480782, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 22313 + }, + { + "epoch": 0.22314, + "grad_norm": 0.6959927651351643, + "learning_rate": 0.003, + "loss": 4.045, + "step": 22314 + }, + { + "epoch": 0.22315, + "grad_norm": 0.6492558182809137, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 22315 + }, + { + "epoch": 0.22316, + "grad_norm": 0.6555061392730082, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 22316 + }, + { + "epoch": 0.22317, + "grad_norm": 0.7549157799869123, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 22317 + }, + { + "epoch": 0.22318, + "grad_norm": 1.0218088420765945, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 22318 + }, + { + "epoch": 0.22319, + "grad_norm": 1.2023068800123458, + "learning_rate": 0.003, + "loss": 4.074, + "step": 22319 + }, + { + "epoch": 0.2232, + "grad_norm": 0.8956242357097777, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 22320 + }, + { + "epoch": 0.22321, + "grad_norm": 0.7921918487447402, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 22321 + }, + { + "epoch": 0.22322, + "grad_norm": 0.6628578418509272, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 22322 + }, + { + "epoch": 0.22323, + "grad_norm": 0.7319420594175866, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 22323 + }, + { + "epoch": 0.22324, + "grad_norm": 0.7298659933019586, + "learning_rate": 0.003, + "loss": 4.043, + "step": 22324 + }, + { + "epoch": 0.22325, + "grad_norm": 0.7176008055459516, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 22325 + }, + { + "epoch": 0.22326, + "grad_norm": 0.6626282350332566, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 22326 + }, + { + "epoch": 0.22327, + "grad_norm": 0.7111288543135935, + "learning_rate": 0.003, + "loss": 4.029, + "step": 22327 + }, + { + "epoch": 0.22328, + "grad_norm": 0.9463900575615173, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 22328 + }, + { + "epoch": 0.22329, + "grad_norm": 1.2562194997330671, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 22329 + }, + { + "epoch": 0.2233, + "grad_norm": 0.8636073504667185, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 22330 + }, + { + "epoch": 0.22331, + "grad_norm": 0.9206882770660257, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 22331 + }, + { + "epoch": 0.22332, + "grad_norm": 1.046592091410422, + "learning_rate": 0.003, + "loss": 4.047, + "step": 22332 + }, + { + "epoch": 0.22333, + "grad_norm": 1.0462322944588742, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 22333 + }, + { + "epoch": 0.22334, + "grad_norm": 0.9020284414609357, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 22334 + }, + { + "epoch": 0.22335, + "grad_norm": 0.9947951804893863, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 22335 + }, + { + "epoch": 0.22336, + "grad_norm": 1.1289436311660437, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 22336 + }, + { + "epoch": 0.22337, + "grad_norm": 0.8530710652814224, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 22337 + }, + { + "epoch": 0.22338, + "grad_norm": 0.7814639217105689, + "learning_rate": 0.003, + "loss": 4.057, + "step": 22338 + }, + { + "epoch": 0.22339, + "grad_norm": 0.7705744082309481, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 22339 + }, + { + "epoch": 0.2234, + "grad_norm": 0.8193633326782319, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 22340 + }, + { + "epoch": 0.22341, + "grad_norm": 0.7376377219136407, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 22341 + }, + { + "epoch": 0.22342, + "grad_norm": 0.667837025005811, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 22342 + }, + { + "epoch": 0.22343, + "grad_norm": 0.8372182190133747, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 22343 + }, + { + "epoch": 0.22344, + "grad_norm": 1.0620723750953345, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 22344 + }, + { + "epoch": 0.22345, + "grad_norm": 1.1000061697785937, + "learning_rate": 0.003, + "loss": 4.054, + "step": 22345 + }, + { + "epoch": 0.22346, + "grad_norm": 0.803681570051572, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 22346 + }, + { + "epoch": 0.22347, + "grad_norm": 0.7678607546404127, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 22347 + }, + { + "epoch": 0.22348, + "grad_norm": 0.8277801165826847, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 22348 + }, + { + "epoch": 0.22349, + "grad_norm": 0.826734993600233, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 22349 + }, + { + "epoch": 0.2235, + "grad_norm": 0.8141172163433259, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 22350 + }, + { + "epoch": 0.22351, + "grad_norm": 0.8259237527287515, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 22351 + }, + { + "epoch": 0.22352, + "grad_norm": 0.9044324309234641, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 22352 + }, + { + "epoch": 0.22353, + "grad_norm": 1.1403936295084518, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 22353 + }, + { + "epoch": 0.22354, + "grad_norm": 1.1350006475764947, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 22354 + }, + { + "epoch": 0.22355, + "grad_norm": 0.9425527216357381, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 22355 + }, + { + "epoch": 0.22356, + "grad_norm": 1.1378720385781806, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 22356 + }, + { + "epoch": 0.22357, + "grad_norm": 1.2487405374216476, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 22357 + }, + { + "epoch": 0.22358, + "grad_norm": 0.9463478737352174, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 22358 + }, + { + "epoch": 0.22359, + "grad_norm": 1.0592010575135142, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 22359 + }, + { + "epoch": 0.2236, + "grad_norm": 1.035350133893251, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 22360 + }, + { + "epoch": 0.22361, + "grad_norm": 0.8917000145572558, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 22361 + }, + { + "epoch": 0.22362, + "grad_norm": 0.8603779596426748, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 22362 + }, + { + "epoch": 0.22363, + "grad_norm": 0.8644951247027882, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 22363 + }, + { + "epoch": 0.22364, + "grad_norm": 1.0618178088122887, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 22364 + }, + { + "epoch": 0.22365, + "grad_norm": 0.9393762913581301, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 22365 + }, + { + "epoch": 0.22366, + "grad_norm": 0.804479893219825, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 22366 + }, + { + "epoch": 0.22367, + "grad_norm": 0.771663889860158, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 22367 + }, + { + "epoch": 0.22368, + "grad_norm": 0.7346449402990187, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 22368 + }, + { + "epoch": 0.22369, + "grad_norm": 0.8186387414665576, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 22369 + }, + { + "epoch": 0.2237, + "grad_norm": 0.952177223972269, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 22370 + }, + { + "epoch": 0.22371, + "grad_norm": 1.359945589997042, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 22371 + }, + { + "epoch": 0.22372, + "grad_norm": 0.8012654112858634, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 22372 + }, + { + "epoch": 0.22373, + "grad_norm": 0.7356740837704214, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 22373 + }, + { + "epoch": 0.22374, + "grad_norm": 0.6949995479371482, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 22374 + }, + { + "epoch": 0.22375, + "grad_norm": 0.7597166674084919, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 22375 + }, + { + "epoch": 0.22376, + "grad_norm": 0.7710603796693849, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 22376 + }, + { + "epoch": 0.22377, + "grad_norm": 0.8474986295447465, + "learning_rate": 0.003, + "loss": 4.006, + "step": 22377 + }, + { + "epoch": 0.22378, + "grad_norm": 0.9429507591970833, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 22378 + }, + { + "epoch": 0.22379, + "grad_norm": 1.1603650634506242, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 22379 + }, + { + "epoch": 0.2238, + "grad_norm": 1.1322971081235735, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 22380 + }, + { + "epoch": 0.22381, + "grad_norm": 0.8954707528543132, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 22381 + }, + { + "epoch": 0.22382, + "grad_norm": 0.7590314448402322, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 22382 + }, + { + "epoch": 0.22383, + "grad_norm": 0.7082814535073892, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 22383 + }, + { + "epoch": 0.22384, + "grad_norm": 0.6869516842089984, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 22384 + }, + { + "epoch": 0.22385, + "grad_norm": 0.6634108021481424, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 22385 + }, + { + "epoch": 0.22386, + "grad_norm": 0.6235367990405143, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 22386 + }, + { + "epoch": 0.22387, + "grad_norm": 0.596753933968273, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 22387 + }, + { + "epoch": 0.22388, + "grad_norm": 0.6086483230944381, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 22388 + }, + { + "epoch": 0.22389, + "grad_norm": 0.6597716993030494, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 22389 + }, + { + "epoch": 0.2239, + "grad_norm": 0.9004788820746377, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 22390 + }, + { + "epoch": 0.22391, + "grad_norm": 1.199098777885385, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 22391 + }, + { + "epoch": 0.22392, + "grad_norm": 0.8767882068154063, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 22392 + }, + { + "epoch": 0.22393, + "grad_norm": 0.8482362624263489, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 22393 + }, + { + "epoch": 0.22394, + "grad_norm": 0.8405518808929641, + "learning_rate": 0.003, + "loss": 4.045, + "step": 22394 + }, + { + "epoch": 0.22395, + "grad_norm": 0.8317343879495549, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 22395 + }, + { + "epoch": 0.22396, + "grad_norm": 0.7761655127230493, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 22396 + }, + { + "epoch": 0.22397, + "grad_norm": 0.8056297916851944, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 22397 + }, + { + "epoch": 0.22398, + "grad_norm": 0.8890216468949055, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 22398 + }, + { + "epoch": 0.22399, + "grad_norm": 1.0340271312224014, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 22399 + }, + { + "epoch": 0.224, + "grad_norm": 1.1423192226735999, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 22400 + }, + { + "epoch": 0.22401, + "grad_norm": 0.9970181720749213, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 22401 + }, + { + "epoch": 0.22402, + "grad_norm": 0.9941213546991586, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 22402 + }, + { + "epoch": 0.22403, + "grad_norm": 1.0494083599481703, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 22403 + }, + { + "epoch": 0.22404, + "grad_norm": 0.9947243299248854, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 22404 + }, + { + "epoch": 0.22405, + "grad_norm": 1.048932806997926, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 22405 + }, + { + "epoch": 0.22406, + "grad_norm": 0.9829655524326324, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 22406 + }, + { + "epoch": 0.22407, + "grad_norm": 0.983543152382307, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 22407 + }, + { + "epoch": 0.22408, + "grad_norm": 1.0295198385116437, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 22408 + }, + { + "epoch": 0.22409, + "grad_norm": 0.7879879786122067, + "learning_rate": 0.003, + "loss": 4.074, + "step": 22409 + }, + { + "epoch": 0.2241, + "grad_norm": 0.6920471352836073, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 22410 + }, + { + "epoch": 0.22411, + "grad_norm": 0.7335191785731368, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 22411 + }, + { + "epoch": 0.22412, + "grad_norm": 0.8874712535614557, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 22412 + }, + { + "epoch": 0.22413, + "grad_norm": 1.375623960449763, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 22413 + }, + { + "epoch": 0.22414, + "grad_norm": 0.7614122022734574, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 22414 + }, + { + "epoch": 0.22415, + "grad_norm": 0.6763293359616619, + "learning_rate": 0.003, + "loss": 4.024, + "step": 22415 + }, + { + "epoch": 0.22416, + "grad_norm": 0.7665652273534039, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 22416 + }, + { + "epoch": 0.22417, + "grad_norm": 0.8542468855922213, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 22417 + }, + { + "epoch": 0.22418, + "grad_norm": 1.0406223562079902, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 22418 + }, + { + "epoch": 0.22419, + "grad_norm": 0.9816528431228511, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 22419 + }, + { + "epoch": 0.2242, + "grad_norm": 0.9367438140124987, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 22420 + }, + { + "epoch": 0.22421, + "grad_norm": 0.9813251260574493, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 22421 + }, + { + "epoch": 0.22422, + "grad_norm": 1.037303855142213, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 22422 + }, + { + "epoch": 0.22423, + "grad_norm": 1.2849847322169115, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 22423 + }, + { + "epoch": 0.22424, + "grad_norm": 0.9342386198093378, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 22424 + }, + { + "epoch": 0.22425, + "grad_norm": 0.885714215238561, + "learning_rate": 0.003, + "loss": 4.05, + "step": 22425 + }, + { + "epoch": 0.22426, + "grad_norm": 0.8882791322146617, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 22426 + }, + { + "epoch": 0.22427, + "grad_norm": 0.8551045797536737, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 22427 + }, + { + "epoch": 0.22428, + "grad_norm": 0.9051532030889349, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 22428 + }, + { + "epoch": 0.22429, + "grad_norm": 0.911895252037181, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 22429 + }, + { + "epoch": 0.2243, + "grad_norm": 0.9374602615606485, + "learning_rate": 0.003, + "loss": 4.05, + "step": 22430 + }, + { + "epoch": 0.22431, + "grad_norm": 1.0820237191025959, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 22431 + }, + { + "epoch": 0.22432, + "grad_norm": 1.0842745461727958, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 22432 + }, + { + "epoch": 0.22433, + "grad_norm": 1.0726745386657761, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 22433 + }, + { + "epoch": 0.22434, + "grad_norm": 1.2787279423021063, + "learning_rate": 0.003, + "loss": 4.064, + "step": 22434 + }, + { + "epoch": 0.22435, + "grad_norm": 0.9446789932747638, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 22435 + }, + { + "epoch": 0.22436, + "grad_norm": 0.8495633647217364, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 22436 + }, + { + "epoch": 0.22437, + "grad_norm": 0.8257725090767902, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 22437 + }, + { + "epoch": 0.22438, + "grad_norm": 0.8330179977379383, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 22438 + }, + { + "epoch": 0.22439, + "grad_norm": 0.7369863711056917, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 22439 + }, + { + "epoch": 0.2244, + "grad_norm": 0.760668533826531, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 22440 + }, + { + "epoch": 0.22441, + "grad_norm": 0.7756009370562948, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 22441 + }, + { + "epoch": 0.22442, + "grad_norm": 0.8159686665261338, + "learning_rate": 0.003, + "loss": 4.077, + "step": 22442 + }, + { + "epoch": 0.22443, + "grad_norm": 0.7695657106284184, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 22443 + }, + { + "epoch": 0.22444, + "grad_norm": 0.767361157660122, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 22444 + }, + { + "epoch": 0.22445, + "grad_norm": 0.6751116306296152, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 22445 + }, + { + "epoch": 0.22446, + "grad_norm": 0.7111547860169535, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 22446 + }, + { + "epoch": 0.22447, + "grad_norm": 0.8027817595233157, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 22447 + }, + { + "epoch": 0.22448, + "grad_norm": 0.9619935572938082, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 22448 + }, + { + "epoch": 0.22449, + "grad_norm": 1.4128354503813663, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 22449 + }, + { + "epoch": 0.2245, + "grad_norm": 0.8199746721448757, + "learning_rate": 0.003, + "loss": 4.068, + "step": 22450 + }, + { + "epoch": 0.22451, + "grad_norm": 0.7376818926237061, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 22451 + }, + { + "epoch": 0.22452, + "grad_norm": 0.7225765552580602, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 22452 + }, + { + "epoch": 0.22453, + "grad_norm": 0.674240454017934, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 22453 + }, + { + "epoch": 0.22454, + "grad_norm": 0.597986516205863, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 22454 + }, + { + "epoch": 0.22455, + "grad_norm": 0.6657223122003431, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 22455 + }, + { + "epoch": 0.22456, + "grad_norm": 0.6998662127518855, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 22456 + }, + { + "epoch": 0.22457, + "grad_norm": 0.6879094052363666, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 22457 + }, + { + "epoch": 0.22458, + "grad_norm": 0.7387644489815635, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 22458 + }, + { + "epoch": 0.22459, + "grad_norm": 0.9012618934900345, + "learning_rate": 0.003, + "loss": 4.03, + "step": 22459 + }, + { + "epoch": 0.2246, + "grad_norm": 1.064742311103599, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 22460 + }, + { + "epoch": 0.22461, + "grad_norm": 1.0128085301764587, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 22461 + }, + { + "epoch": 0.22462, + "grad_norm": 0.9776856701081688, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 22462 + }, + { + "epoch": 0.22463, + "grad_norm": 1.0983449629603956, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 22463 + }, + { + "epoch": 0.22464, + "grad_norm": 1.050475174352711, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 22464 + }, + { + "epoch": 0.22465, + "grad_norm": 1.0195367633204806, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 22465 + }, + { + "epoch": 0.22466, + "grad_norm": 1.110525210294342, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 22466 + }, + { + "epoch": 0.22467, + "grad_norm": 0.9653978000317589, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 22467 + }, + { + "epoch": 0.22468, + "grad_norm": 0.8729096779729661, + "learning_rate": 0.003, + "loss": 4.053, + "step": 22468 + }, + { + "epoch": 0.22469, + "grad_norm": 0.8533826002763989, + "learning_rate": 0.003, + "loss": 4.084, + "step": 22469 + }, + { + "epoch": 0.2247, + "grad_norm": 0.8661519660377732, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 22470 + }, + { + "epoch": 0.22471, + "grad_norm": 0.9454466183737082, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 22471 + }, + { + "epoch": 0.22472, + "grad_norm": 0.9044980396172726, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 22472 + }, + { + "epoch": 0.22473, + "grad_norm": 0.7838565573244917, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 22473 + }, + { + "epoch": 0.22474, + "grad_norm": 0.7681634831293778, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 22474 + }, + { + "epoch": 0.22475, + "grad_norm": 0.8265672278491937, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 22475 + }, + { + "epoch": 0.22476, + "grad_norm": 0.9408747466032825, + "learning_rate": 0.003, + "loss": 4.071, + "step": 22476 + }, + { + "epoch": 0.22477, + "grad_norm": 1.147863715871065, + "learning_rate": 0.003, + "loss": 4.037, + "step": 22477 + }, + { + "epoch": 0.22478, + "grad_norm": 0.9002003248387239, + "learning_rate": 0.003, + "loss": 4.066, + "step": 22478 + }, + { + "epoch": 0.22479, + "grad_norm": 0.8873843965061498, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 22479 + }, + { + "epoch": 0.2248, + "grad_norm": 0.8883603408810261, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 22480 + }, + { + "epoch": 0.22481, + "grad_norm": 1.0005760126302268, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 22481 + }, + { + "epoch": 0.22482, + "grad_norm": 1.0165572050469156, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 22482 + }, + { + "epoch": 0.22483, + "grad_norm": 1.0239835058448845, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 22483 + }, + { + "epoch": 0.22484, + "grad_norm": 0.8460874881824254, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 22484 + }, + { + "epoch": 0.22485, + "grad_norm": 0.7975947538485714, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 22485 + }, + { + "epoch": 0.22486, + "grad_norm": 0.8657508696457858, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 22486 + }, + { + "epoch": 0.22487, + "grad_norm": 0.8371720375375046, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 22487 + }, + { + "epoch": 0.22488, + "grad_norm": 0.8910251004878182, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 22488 + }, + { + "epoch": 0.22489, + "grad_norm": 0.8426074338206573, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 22489 + }, + { + "epoch": 0.2249, + "grad_norm": 1.115577403561262, + "learning_rate": 0.003, + "loss": 4.07, + "step": 22490 + }, + { + "epoch": 0.22491, + "grad_norm": 1.0797138636103445, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 22491 + }, + { + "epoch": 0.22492, + "grad_norm": 1.0302809791529903, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 22492 + }, + { + "epoch": 0.22493, + "grad_norm": 1.100784740632589, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 22493 + }, + { + "epoch": 0.22494, + "grad_norm": 1.024358751639694, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 22494 + }, + { + "epoch": 0.22495, + "grad_norm": 1.10120163146534, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 22495 + }, + { + "epoch": 0.22496, + "grad_norm": 0.8278327030218051, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 22496 + }, + { + "epoch": 0.22497, + "grad_norm": 0.7788613330822212, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 22497 + }, + { + "epoch": 0.22498, + "grad_norm": 0.8247215002733669, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 22498 + }, + { + "epoch": 0.22499, + "grad_norm": 0.8253248770858604, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 22499 + }, + { + "epoch": 0.225, + "grad_norm": 0.6720663708044156, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 22500 + }, + { + "epoch": 0.22501, + "grad_norm": 0.6513400309322824, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 22501 + }, + { + "epoch": 0.22502, + "grad_norm": 0.6121646358825221, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 22502 + }, + { + "epoch": 0.22503, + "grad_norm": 0.574340405763607, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 22503 + }, + { + "epoch": 0.22504, + "grad_norm": 0.6170797344207057, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 22504 + }, + { + "epoch": 0.22505, + "grad_norm": 0.6806065791666323, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 22505 + }, + { + "epoch": 0.22506, + "grad_norm": 0.7932940420826946, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 22506 + }, + { + "epoch": 0.22507, + "grad_norm": 0.990946882618376, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 22507 + }, + { + "epoch": 0.22508, + "grad_norm": 1.0732940369211845, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 22508 + }, + { + "epoch": 0.22509, + "grad_norm": 0.9824475680713082, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 22509 + }, + { + "epoch": 0.2251, + "grad_norm": 0.8524620174114014, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 22510 + }, + { + "epoch": 0.22511, + "grad_norm": 0.8294394869232353, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 22511 + }, + { + "epoch": 0.22512, + "grad_norm": 0.8132779241261927, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 22512 + }, + { + "epoch": 0.22513, + "grad_norm": 0.8915216223813818, + "learning_rate": 0.003, + "loss": 4.04, + "step": 22513 + }, + { + "epoch": 0.22514, + "grad_norm": 0.9747971362160897, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 22514 + }, + { + "epoch": 0.22515, + "grad_norm": 1.2165438467058163, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 22515 + }, + { + "epoch": 0.22516, + "grad_norm": 0.8619242201440362, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 22516 + }, + { + "epoch": 0.22517, + "grad_norm": 0.8013797388295194, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 22517 + }, + { + "epoch": 0.22518, + "grad_norm": 0.745392620483749, + "learning_rate": 0.003, + "loss": 4.041, + "step": 22518 + }, + { + "epoch": 0.22519, + "grad_norm": 0.7247632398876357, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 22519 + }, + { + "epoch": 0.2252, + "grad_norm": 0.7190065003349567, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 22520 + }, + { + "epoch": 0.22521, + "grad_norm": 0.7683930864661954, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 22521 + }, + { + "epoch": 0.22522, + "grad_norm": 0.7574259775700574, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 22522 + }, + { + "epoch": 0.22523, + "grad_norm": 0.8059692805691367, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 22523 + }, + { + "epoch": 0.22524, + "grad_norm": 1.0352279462374505, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 22524 + }, + { + "epoch": 0.22525, + "grad_norm": 1.0199182312852217, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 22525 + }, + { + "epoch": 0.22526, + "grad_norm": 0.9635404966026307, + "learning_rate": 0.003, + "loss": 4.075, + "step": 22526 + }, + { + "epoch": 0.22527, + "grad_norm": 1.073452804157034, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 22527 + }, + { + "epoch": 0.22528, + "grad_norm": 1.057365274750992, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 22528 + }, + { + "epoch": 0.22529, + "grad_norm": 0.9664730460758946, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 22529 + }, + { + "epoch": 0.2253, + "grad_norm": 1.0122980328690805, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 22530 + }, + { + "epoch": 0.22531, + "grad_norm": 1.1736123434171162, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 22531 + }, + { + "epoch": 0.22532, + "grad_norm": 0.9629095351500622, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 22532 + }, + { + "epoch": 0.22533, + "grad_norm": 0.945575275575981, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 22533 + }, + { + "epoch": 0.22534, + "grad_norm": 0.9560840396590854, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 22534 + }, + { + "epoch": 0.22535, + "grad_norm": 0.957441548761198, + "learning_rate": 0.003, + "loss": 4.071, + "step": 22535 + }, + { + "epoch": 0.22536, + "grad_norm": 1.0119666166341508, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 22536 + }, + { + "epoch": 0.22537, + "grad_norm": 0.8912505217717467, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 22537 + }, + { + "epoch": 0.22538, + "grad_norm": 0.9783019584446301, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 22538 + }, + { + "epoch": 0.22539, + "grad_norm": 0.9776628271835963, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 22539 + }, + { + "epoch": 0.2254, + "grad_norm": 0.9673462477124672, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 22540 + }, + { + "epoch": 0.22541, + "grad_norm": 0.8602254145094855, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 22541 + }, + { + "epoch": 0.22542, + "grad_norm": 0.8628526849546995, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 22542 + }, + { + "epoch": 0.22543, + "grad_norm": 0.7917171637841883, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 22543 + }, + { + "epoch": 0.22544, + "grad_norm": 0.7889094961787414, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 22544 + }, + { + "epoch": 0.22545, + "grad_norm": 0.8723715399170936, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 22545 + }, + { + "epoch": 0.22546, + "grad_norm": 1.1398832572415534, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 22546 + }, + { + "epoch": 0.22547, + "grad_norm": 0.9842249532223255, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 22547 + }, + { + "epoch": 0.22548, + "grad_norm": 0.9505917507656427, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 22548 + }, + { + "epoch": 0.22549, + "grad_norm": 0.9464247013152843, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 22549 + }, + { + "epoch": 0.2255, + "grad_norm": 0.8695449853537233, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 22550 + }, + { + "epoch": 0.22551, + "grad_norm": 0.931958727355571, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 22551 + }, + { + "epoch": 0.22552, + "grad_norm": 1.0083673884591295, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 22552 + }, + { + "epoch": 0.22553, + "grad_norm": 1.0028389181120434, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 22553 + }, + { + "epoch": 0.22554, + "grad_norm": 1.1772338924028771, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 22554 + }, + { + "epoch": 0.22555, + "grad_norm": 0.965199777169094, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 22555 + }, + { + "epoch": 0.22556, + "grad_norm": 0.9389978371710452, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 22556 + }, + { + "epoch": 0.22557, + "grad_norm": 0.9581433870462375, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 22557 + }, + { + "epoch": 0.22558, + "grad_norm": 1.0140599141129432, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 22558 + }, + { + "epoch": 0.22559, + "grad_norm": 0.8604015323152806, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 22559 + }, + { + "epoch": 0.2256, + "grad_norm": 0.8166746744255023, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 22560 + }, + { + "epoch": 0.22561, + "grad_norm": 0.8587403428794941, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 22561 + }, + { + "epoch": 0.22562, + "grad_norm": 0.8776156589332852, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 22562 + }, + { + "epoch": 0.22563, + "grad_norm": 0.9450824447850328, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 22563 + }, + { + "epoch": 0.22564, + "grad_norm": 0.9561883635352979, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 22564 + }, + { + "epoch": 0.22565, + "grad_norm": 0.9882176140159471, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 22565 + }, + { + "epoch": 0.22566, + "grad_norm": 0.8135825613386174, + "learning_rate": 0.003, + "loss": 4.06, + "step": 22566 + }, + { + "epoch": 0.22567, + "grad_norm": 0.7347851553409692, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 22567 + }, + { + "epoch": 0.22568, + "grad_norm": 0.690799113498159, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 22568 + }, + { + "epoch": 0.22569, + "grad_norm": 0.7162221793077578, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 22569 + }, + { + "epoch": 0.2257, + "grad_norm": 0.6806179485820844, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 22570 + }, + { + "epoch": 0.22571, + "grad_norm": 0.7216893418779671, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 22571 + }, + { + "epoch": 0.22572, + "grad_norm": 0.9024696562772438, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 22572 + }, + { + "epoch": 0.22573, + "grad_norm": 1.2127673296201573, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 22573 + }, + { + "epoch": 0.22574, + "grad_norm": 0.8479077283209163, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 22574 + }, + { + "epoch": 0.22575, + "grad_norm": 0.9083264485051574, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 22575 + }, + { + "epoch": 0.22576, + "grad_norm": 0.9839362927703458, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 22576 + }, + { + "epoch": 0.22577, + "grad_norm": 0.9682398027742857, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 22577 + }, + { + "epoch": 0.22578, + "grad_norm": 0.785383788338004, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 22578 + }, + { + "epoch": 0.22579, + "grad_norm": 0.773265432136049, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 22579 + }, + { + "epoch": 0.2258, + "grad_norm": 0.8374900736427304, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 22580 + }, + { + "epoch": 0.22581, + "grad_norm": 1.1235541656684023, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 22581 + }, + { + "epoch": 0.22582, + "grad_norm": 0.9878613728351212, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 22582 + }, + { + "epoch": 0.22583, + "grad_norm": 0.9097155948238201, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 22583 + }, + { + "epoch": 0.22584, + "grad_norm": 0.7342963290879935, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 22584 + }, + { + "epoch": 0.22585, + "grad_norm": 0.7310469446253919, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 22585 + }, + { + "epoch": 0.22586, + "grad_norm": 0.6564935680284889, + "learning_rate": 0.003, + "loss": 4.064, + "step": 22586 + }, + { + "epoch": 0.22587, + "grad_norm": 0.5752614337517873, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 22587 + }, + { + "epoch": 0.22588, + "grad_norm": 0.6277696751087115, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 22588 + }, + { + "epoch": 0.22589, + "grad_norm": 0.647207836677466, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 22589 + }, + { + "epoch": 0.2259, + "grad_norm": 0.6195436317102999, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 22590 + }, + { + "epoch": 0.22591, + "grad_norm": 0.6230322544870034, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 22591 + }, + { + "epoch": 0.22592, + "grad_norm": 0.8071873172682745, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 22592 + }, + { + "epoch": 0.22593, + "grad_norm": 1.1438259845801872, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 22593 + }, + { + "epoch": 0.22594, + "grad_norm": 1.1416447741204918, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 22594 + }, + { + "epoch": 0.22595, + "grad_norm": 0.6809705937637491, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 22595 + }, + { + "epoch": 0.22596, + "grad_norm": 0.6218388583904539, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 22596 + }, + { + "epoch": 0.22597, + "grad_norm": 0.632011795915333, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 22597 + }, + { + "epoch": 0.22598, + "grad_norm": 0.6833971399491734, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 22598 + }, + { + "epoch": 0.22599, + "grad_norm": 1.047985364137886, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 22599 + }, + { + "epoch": 0.226, + "grad_norm": 1.2289694623537284, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 22600 + }, + { + "epoch": 0.22601, + "grad_norm": 0.7802878888261349, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 22601 + }, + { + "epoch": 0.22602, + "grad_norm": 0.7654255100307006, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 22602 + }, + { + "epoch": 0.22603, + "grad_norm": 0.7742124186013298, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 22603 + }, + { + "epoch": 0.22604, + "grad_norm": 1.0025761483727609, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 22604 + }, + { + "epoch": 0.22605, + "grad_norm": 1.4886545270918778, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 22605 + }, + { + "epoch": 0.22606, + "grad_norm": 0.6736633516094532, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 22606 + }, + { + "epoch": 0.22607, + "grad_norm": 0.8297948904732105, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 22607 + }, + { + "epoch": 0.22608, + "grad_norm": 0.9367644543220012, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 22608 + }, + { + "epoch": 0.22609, + "grad_norm": 1.1171275809050112, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 22609 + }, + { + "epoch": 0.2261, + "grad_norm": 0.9522670237318388, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 22610 + }, + { + "epoch": 0.22611, + "grad_norm": 0.9296105377518948, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 22611 + }, + { + "epoch": 0.22612, + "grad_norm": 1.0450105775301994, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 22612 + }, + { + "epoch": 0.22613, + "grad_norm": 0.9261372014122379, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 22613 + }, + { + "epoch": 0.22614, + "grad_norm": 0.9902630454626118, + "learning_rate": 0.003, + "loss": 4.069, + "step": 22614 + }, + { + "epoch": 0.22615, + "grad_norm": 0.977385652722921, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 22615 + }, + { + "epoch": 0.22616, + "grad_norm": 0.9091130785402171, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 22616 + }, + { + "epoch": 0.22617, + "grad_norm": 1.0163270259028543, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 22617 + }, + { + "epoch": 0.22618, + "grad_norm": 1.2031937925341383, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 22618 + }, + { + "epoch": 0.22619, + "grad_norm": 1.0041494756507696, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 22619 + }, + { + "epoch": 0.2262, + "grad_norm": 0.9619042395614545, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 22620 + }, + { + "epoch": 0.22621, + "grad_norm": 0.9129350339396065, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 22621 + }, + { + "epoch": 0.22622, + "grad_norm": 1.0095613330379023, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 22622 + }, + { + "epoch": 0.22623, + "grad_norm": 1.1949991306262393, + "learning_rate": 0.003, + "loss": 4.051, + "step": 22623 + }, + { + "epoch": 0.22624, + "grad_norm": 0.8963117514310439, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 22624 + }, + { + "epoch": 0.22625, + "grad_norm": 1.0232834739506285, + "learning_rate": 0.003, + "loss": 4.066, + "step": 22625 + }, + { + "epoch": 0.22626, + "grad_norm": 0.911278980625608, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 22626 + }, + { + "epoch": 0.22627, + "grad_norm": 0.9805460639855323, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 22627 + }, + { + "epoch": 0.22628, + "grad_norm": 0.9053700926917654, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 22628 + }, + { + "epoch": 0.22629, + "grad_norm": 0.8838639267749948, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 22629 + }, + { + "epoch": 0.2263, + "grad_norm": 0.9692285526666109, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 22630 + }, + { + "epoch": 0.22631, + "grad_norm": 1.0028287599238581, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 22631 + }, + { + "epoch": 0.22632, + "grad_norm": 0.9011479773239284, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 22632 + }, + { + "epoch": 0.22633, + "grad_norm": 0.8612664806624377, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 22633 + }, + { + "epoch": 0.22634, + "grad_norm": 0.750262113745714, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 22634 + }, + { + "epoch": 0.22635, + "grad_norm": 0.743952291464596, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 22635 + }, + { + "epoch": 0.22636, + "grad_norm": 0.808419510931088, + "learning_rate": 0.003, + "loss": 4.054, + "step": 22636 + }, + { + "epoch": 0.22637, + "grad_norm": 0.7620130853712486, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 22637 + }, + { + "epoch": 0.22638, + "grad_norm": 0.694338324513765, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 22638 + }, + { + "epoch": 0.22639, + "grad_norm": 0.7550333084737811, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 22639 + }, + { + "epoch": 0.2264, + "grad_norm": 0.8525863573766842, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 22640 + }, + { + "epoch": 0.22641, + "grad_norm": 1.0062752351595612, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 22641 + }, + { + "epoch": 0.22642, + "grad_norm": 1.0537390949528482, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 22642 + }, + { + "epoch": 0.22643, + "grad_norm": 1.0025652451117242, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 22643 + }, + { + "epoch": 0.22644, + "grad_norm": 0.96830712391237, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 22644 + }, + { + "epoch": 0.22645, + "grad_norm": 1.034363807063051, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 22645 + }, + { + "epoch": 0.22646, + "grad_norm": 1.1990352588546782, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 22646 + }, + { + "epoch": 0.22647, + "grad_norm": 1.0459371360253151, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 22647 + }, + { + "epoch": 0.22648, + "grad_norm": 1.0276827455692468, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 22648 + }, + { + "epoch": 0.22649, + "grad_norm": 0.9796795019469517, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 22649 + }, + { + "epoch": 0.2265, + "grad_norm": 0.8818432232629168, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 22650 + }, + { + "epoch": 0.22651, + "grad_norm": 0.628120121846171, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 22651 + }, + { + "epoch": 0.22652, + "grad_norm": 0.6145968841593442, + "learning_rate": 0.003, + "loss": 4.065, + "step": 22652 + }, + { + "epoch": 0.22653, + "grad_norm": 0.720373343094132, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 22653 + }, + { + "epoch": 0.22654, + "grad_norm": 0.7499659342626266, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 22654 + }, + { + "epoch": 0.22655, + "grad_norm": 0.7234087522757889, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 22655 + }, + { + "epoch": 0.22656, + "grad_norm": 0.7545664706458002, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 22656 + }, + { + "epoch": 0.22657, + "grad_norm": 0.900128869164829, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 22657 + }, + { + "epoch": 0.22658, + "grad_norm": 0.9276287783009832, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 22658 + }, + { + "epoch": 0.22659, + "grad_norm": 0.8283135664338659, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 22659 + }, + { + "epoch": 0.2266, + "grad_norm": 0.8789792512460451, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 22660 + }, + { + "epoch": 0.22661, + "grad_norm": 1.0052539515852346, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 22661 + }, + { + "epoch": 0.22662, + "grad_norm": 1.080767698072948, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 22662 + }, + { + "epoch": 0.22663, + "grad_norm": 0.8611203467525144, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 22663 + }, + { + "epoch": 0.22664, + "grad_norm": 0.9449416565356623, + "learning_rate": 0.003, + "loss": 4.06, + "step": 22664 + }, + { + "epoch": 0.22665, + "grad_norm": 0.971622178537138, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 22665 + }, + { + "epoch": 0.22666, + "grad_norm": 1.0356852569858708, + "learning_rate": 0.003, + "loss": 4.08, + "step": 22666 + }, + { + "epoch": 0.22667, + "grad_norm": 1.007553276766393, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 22667 + }, + { + "epoch": 0.22668, + "grad_norm": 0.9174707531427051, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 22668 + }, + { + "epoch": 0.22669, + "grad_norm": 0.8425523705853986, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 22669 + }, + { + "epoch": 0.2267, + "grad_norm": 0.7941694785805898, + "learning_rate": 0.003, + "loss": 4.048, + "step": 22670 + }, + { + "epoch": 0.22671, + "grad_norm": 0.7749838193621441, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 22671 + }, + { + "epoch": 0.22672, + "grad_norm": 0.7918445039759081, + "learning_rate": 0.003, + "loss": 4.056, + "step": 22672 + }, + { + "epoch": 0.22673, + "grad_norm": 0.8646885110451289, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 22673 + }, + { + "epoch": 0.22674, + "grad_norm": 0.8345245611150793, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 22674 + }, + { + "epoch": 0.22675, + "grad_norm": 0.7975428682059714, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 22675 + }, + { + "epoch": 0.22676, + "grad_norm": 0.9170779721343335, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 22676 + }, + { + "epoch": 0.22677, + "grad_norm": 1.0018992931708357, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 22677 + }, + { + "epoch": 0.22678, + "grad_norm": 0.9259278629586358, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 22678 + }, + { + "epoch": 0.22679, + "grad_norm": 0.9581009692431548, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 22679 + }, + { + "epoch": 0.2268, + "grad_norm": 1.1510095775516627, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 22680 + }, + { + "epoch": 0.22681, + "grad_norm": 0.9866108478374495, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 22681 + }, + { + "epoch": 0.22682, + "grad_norm": 0.9830469804042229, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 22682 + }, + { + "epoch": 0.22683, + "grad_norm": 1.0039065982777216, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 22683 + }, + { + "epoch": 0.22684, + "grad_norm": 0.9832268895434356, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 22684 + }, + { + "epoch": 0.22685, + "grad_norm": 0.947447633492396, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 22685 + }, + { + "epoch": 0.22686, + "grad_norm": 0.8837451486182951, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 22686 + }, + { + "epoch": 0.22687, + "grad_norm": 0.8565653017539905, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 22687 + }, + { + "epoch": 0.22688, + "grad_norm": 1.0104639017512123, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 22688 + }, + { + "epoch": 0.22689, + "grad_norm": 0.975114170922442, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 22689 + }, + { + "epoch": 0.2269, + "grad_norm": 0.8449339228154968, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 22690 + }, + { + "epoch": 0.22691, + "grad_norm": 0.8151313485000596, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 22691 + }, + { + "epoch": 0.22692, + "grad_norm": 0.8162244058539979, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 22692 + }, + { + "epoch": 0.22693, + "grad_norm": 0.7488576404896917, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 22693 + }, + { + "epoch": 0.22694, + "grad_norm": 0.7515467946755654, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 22694 + }, + { + "epoch": 0.22695, + "grad_norm": 0.7440309682747194, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 22695 + }, + { + "epoch": 0.22696, + "grad_norm": 0.6645082399531839, + "learning_rate": 0.003, + "loss": 4.046, + "step": 22696 + }, + { + "epoch": 0.22697, + "grad_norm": 0.6656852626486821, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 22697 + }, + { + "epoch": 0.22698, + "grad_norm": 0.5735199010982336, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 22698 + }, + { + "epoch": 0.22699, + "grad_norm": 0.6224573279349771, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 22699 + }, + { + "epoch": 0.227, + "grad_norm": 0.6333014785989532, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 22700 + }, + { + "epoch": 0.22701, + "grad_norm": 0.6920123584011759, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 22701 + }, + { + "epoch": 0.22702, + "grad_norm": 0.747329756271678, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 22702 + }, + { + "epoch": 0.22703, + "grad_norm": 0.7110852928129989, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 22703 + }, + { + "epoch": 0.22704, + "grad_norm": 0.7165391520100443, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 22704 + }, + { + "epoch": 0.22705, + "grad_norm": 0.7540745466164605, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 22705 + }, + { + "epoch": 0.22706, + "grad_norm": 0.8697672264851807, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 22706 + }, + { + "epoch": 0.22707, + "grad_norm": 1.183111399923685, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 22707 + }, + { + "epoch": 0.22708, + "grad_norm": 1.1491430180039552, + "learning_rate": 0.003, + "loss": 4.0087, + "step": 22708 + }, + { + "epoch": 0.22709, + "grad_norm": 0.972757390350557, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 22709 + }, + { + "epoch": 0.2271, + "grad_norm": 0.968494013863733, + "learning_rate": 0.003, + "loss": 4.04, + "step": 22710 + }, + { + "epoch": 0.22711, + "grad_norm": 1.0520071655171688, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 22711 + }, + { + "epoch": 0.22712, + "grad_norm": 0.8910486609978872, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 22712 + }, + { + "epoch": 0.22713, + "grad_norm": 0.7945312930938495, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 22713 + }, + { + "epoch": 0.22714, + "grad_norm": 0.7182497839785165, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 22714 + }, + { + "epoch": 0.22715, + "grad_norm": 0.7069488420876577, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 22715 + }, + { + "epoch": 0.22716, + "grad_norm": 0.7980211387490869, + "learning_rate": 0.003, + "loss": 4.044, + "step": 22716 + }, + { + "epoch": 0.22717, + "grad_norm": 0.9048879610739127, + "learning_rate": 0.003, + "loss": 4.077, + "step": 22717 + }, + { + "epoch": 0.22718, + "grad_norm": 0.8602498474925933, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 22718 + }, + { + "epoch": 0.22719, + "grad_norm": 0.8482471930463336, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 22719 + }, + { + "epoch": 0.2272, + "grad_norm": 0.8133784578779553, + "learning_rate": 0.003, + "loss": 4.058, + "step": 22720 + }, + { + "epoch": 0.22721, + "grad_norm": 0.8104731653097466, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 22721 + }, + { + "epoch": 0.22722, + "grad_norm": 0.8118707964996753, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 22722 + }, + { + "epoch": 0.22723, + "grad_norm": 0.8861620894199845, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 22723 + }, + { + "epoch": 0.22724, + "grad_norm": 1.079175913332311, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 22724 + }, + { + "epoch": 0.22725, + "grad_norm": 1.2562039862360328, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 22725 + }, + { + "epoch": 0.22726, + "grad_norm": 0.8492751766680512, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 22726 + }, + { + "epoch": 0.22727, + "grad_norm": 0.904075217164488, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 22727 + }, + { + "epoch": 0.22728, + "grad_norm": 0.8414714997922506, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 22728 + }, + { + "epoch": 0.22729, + "grad_norm": 0.906962799489872, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 22729 + }, + { + "epoch": 0.2273, + "grad_norm": 0.9967874821560386, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 22730 + }, + { + "epoch": 0.22731, + "grad_norm": 1.1586205386995512, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 22731 + }, + { + "epoch": 0.22732, + "grad_norm": 1.0489328409640646, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 22732 + }, + { + "epoch": 0.22733, + "grad_norm": 0.9610063968691808, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 22733 + }, + { + "epoch": 0.22734, + "grad_norm": 0.909430450755612, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 22734 + }, + { + "epoch": 0.22735, + "grad_norm": 0.9897769424042009, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 22735 + }, + { + "epoch": 0.22736, + "grad_norm": 0.970753056237132, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 22736 + }, + { + "epoch": 0.22737, + "grad_norm": 1.0260003649404208, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 22737 + }, + { + "epoch": 0.22738, + "grad_norm": 1.1877661369093184, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 22738 + }, + { + "epoch": 0.22739, + "grad_norm": 0.9347650162164082, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 22739 + }, + { + "epoch": 0.2274, + "grad_norm": 0.9393559041480698, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 22740 + }, + { + "epoch": 0.22741, + "grad_norm": 0.9967223469859002, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 22741 + }, + { + "epoch": 0.22742, + "grad_norm": 0.9272640767979677, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 22742 + }, + { + "epoch": 0.22743, + "grad_norm": 0.8808696694984538, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 22743 + }, + { + "epoch": 0.22744, + "grad_norm": 0.8754452913224413, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 22744 + }, + { + "epoch": 0.22745, + "grad_norm": 0.7880741554440877, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 22745 + }, + { + "epoch": 0.22746, + "grad_norm": 0.7523070895746822, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 22746 + }, + { + "epoch": 0.22747, + "grad_norm": 0.6716616970521917, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 22747 + }, + { + "epoch": 0.22748, + "grad_norm": 0.6078743567779584, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 22748 + }, + { + "epoch": 0.22749, + "grad_norm": 0.7269726867285148, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 22749 + }, + { + "epoch": 0.2275, + "grad_norm": 0.9344885317325508, + "learning_rate": 0.003, + "loss": 4.046, + "step": 22750 + }, + { + "epoch": 0.22751, + "grad_norm": 1.1503680403094034, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 22751 + }, + { + "epoch": 0.22752, + "grad_norm": 0.8700409231841962, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 22752 + }, + { + "epoch": 0.22753, + "grad_norm": 0.9502787901091739, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 22753 + }, + { + "epoch": 0.22754, + "grad_norm": 1.2170716227290153, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 22754 + }, + { + "epoch": 0.22755, + "grad_norm": 1.001586415329349, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 22755 + }, + { + "epoch": 0.22756, + "grad_norm": 0.9073302937086049, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 22756 + }, + { + "epoch": 0.22757, + "grad_norm": 0.8481127690300688, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 22757 + }, + { + "epoch": 0.22758, + "grad_norm": 0.9188490433164043, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 22758 + }, + { + "epoch": 0.22759, + "grad_norm": 0.862118745179981, + "learning_rate": 0.003, + "loss": 4.044, + "step": 22759 + }, + { + "epoch": 0.2276, + "grad_norm": 0.8769824496909433, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 22760 + }, + { + "epoch": 0.22761, + "grad_norm": 0.9013392694072248, + "learning_rate": 0.003, + "loss": 4.064, + "step": 22761 + }, + { + "epoch": 0.22762, + "grad_norm": 0.9309138302753703, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 22762 + }, + { + "epoch": 0.22763, + "grad_norm": 0.949413238124462, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 22763 + }, + { + "epoch": 0.22764, + "grad_norm": 0.9512018871828793, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 22764 + }, + { + "epoch": 0.22765, + "grad_norm": 0.8689322660909483, + "learning_rate": 0.003, + "loss": 4.04, + "step": 22765 + }, + { + "epoch": 0.22766, + "grad_norm": 0.8444804983371068, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 22766 + }, + { + "epoch": 0.22767, + "grad_norm": 1.080413165072527, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 22767 + }, + { + "epoch": 0.22768, + "grad_norm": 0.9088416420914456, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 22768 + }, + { + "epoch": 0.22769, + "grad_norm": 0.799525837915491, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 22769 + }, + { + "epoch": 0.2277, + "grad_norm": 0.7699723538144105, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 22770 + }, + { + "epoch": 0.22771, + "grad_norm": 0.8143142554392547, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 22771 + }, + { + "epoch": 0.22772, + "grad_norm": 0.9274083261025904, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 22772 + }, + { + "epoch": 0.22773, + "grad_norm": 0.984238176883308, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 22773 + }, + { + "epoch": 0.22774, + "grad_norm": 0.9554511641495651, + "learning_rate": 0.003, + "loss": 4.029, + "step": 22774 + }, + { + "epoch": 0.22775, + "grad_norm": 0.7928942084926369, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 22775 + }, + { + "epoch": 0.22776, + "grad_norm": 0.7207440721741104, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 22776 + }, + { + "epoch": 0.22777, + "grad_norm": 0.6623191295748422, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 22777 + }, + { + "epoch": 0.22778, + "grad_norm": 0.5898935569872705, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 22778 + }, + { + "epoch": 0.22779, + "grad_norm": 0.608286460676412, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 22779 + }, + { + "epoch": 0.2278, + "grad_norm": 0.7065719510964161, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 22780 + }, + { + "epoch": 0.22781, + "grad_norm": 0.8759375015149959, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 22781 + }, + { + "epoch": 0.22782, + "grad_norm": 1.0701114191785137, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 22782 + }, + { + "epoch": 0.22783, + "grad_norm": 1.0109689899197118, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 22783 + }, + { + "epoch": 0.22784, + "grad_norm": 1.1001918809786848, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 22784 + }, + { + "epoch": 0.22785, + "grad_norm": 1.0651519419042537, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 22785 + }, + { + "epoch": 0.22786, + "grad_norm": 0.8809097220494033, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 22786 + }, + { + "epoch": 0.22787, + "grad_norm": 0.6721397554264887, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 22787 + }, + { + "epoch": 0.22788, + "grad_norm": 0.5252983629838026, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 22788 + }, + { + "epoch": 0.22789, + "grad_norm": 0.6071835354482025, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 22789 + }, + { + "epoch": 0.2279, + "grad_norm": 0.7097765682495768, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 22790 + }, + { + "epoch": 0.22791, + "grad_norm": 0.8122285027012834, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 22791 + }, + { + "epoch": 0.22792, + "grad_norm": 0.8741104437717017, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 22792 + }, + { + "epoch": 0.22793, + "grad_norm": 0.7546144575435845, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 22793 + }, + { + "epoch": 0.22794, + "grad_norm": 0.8617384188138572, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 22794 + }, + { + "epoch": 0.22795, + "grad_norm": 1.0403403332749075, + "learning_rate": 0.003, + "loss": 4.106, + "step": 22795 + }, + { + "epoch": 0.22796, + "grad_norm": 1.1477340056332108, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 22796 + }, + { + "epoch": 0.22797, + "grad_norm": 0.9891245123643607, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 22797 + }, + { + "epoch": 0.22798, + "grad_norm": 1.0551842642330131, + "learning_rate": 0.003, + "loss": 4.074, + "step": 22798 + }, + { + "epoch": 0.22799, + "grad_norm": 1.0552942759567618, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 22799 + }, + { + "epoch": 0.228, + "grad_norm": 1.009389497514096, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 22800 + }, + { + "epoch": 0.22801, + "grad_norm": 1.1548037553780226, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 22801 + }, + { + "epoch": 0.22802, + "grad_norm": 0.843224546999332, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 22802 + }, + { + "epoch": 0.22803, + "grad_norm": 0.8061515141629308, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 22803 + }, + { + "epoch": 0.22804, + "grad_norm": 0.7573372806484121, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 22804 + }, + { + "epoch": 0.22805, + "grad_norm": 0.6828518090113931, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 22805 + }, + { + "epoch": 0.22806, + "grad_norm": 0.666160973032393, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 22806 + }, + { + "epoch": 0.22807, + "grad_norm": 0.6534853359153482, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 22807 + }, + { + "epoch": 0.22808, + "grad_norm": 0.694400653267406, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 22808 + }, + { + "epoch": 0.22809, + "grad_norm": 0.6626020941319344, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 22809 + }, + { + "epoch": 0.2281, + "grad_norm": 0.5981677241867265, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 22810 + }, + { + "epoch": 0.22811, + "grad_norm": 0.6691449669962832, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 22811 + }, + { + "epoch": 0.22812, + "grad_norm": 0.772816656462761, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 22812 + }, + { + "epoch": 0.22813, + "grad_norm": 0.9377318248938178, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 22813 + }, + { + "epoch": 0.22814, + "grad_norm": 1.2412529653321838, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 22814 + }, + { + "epoch": 0.22815, + "grad_norm": 0.749202066133807, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 22815 + }, + { + "epoch": 0.22816, + "grad_norm": 0.7501119635213498, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 22816 + }, + { + "epoch": 0.22817, + "grad_norm": 0.7675679163994044, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 22817 + }, + { + "epoch": 0.22818, + "grad_norm": 0.8433282719370441, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 22818 + }, + { + "epoch": 0.22819, + "grad_norm": 0.941368095384892, + "learning_rate": 0.003, + "loss": 4.034, + "step": 22819 + }, + { + "epoch": 0.2282, + "grad_norm": 1.1336983836760979, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 22820 + }, + { + "epoch": 0.22821, + "grad_norm": 1.0208848351092374, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 22821 + }, + { + "epoch": 0.22822, + "grad_norm": 1.1913258834969345, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 22822 + }, + { + "epoch": 0.22823, + "grad_norm": 0.8920054317702796, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 22823 + }, + { + "epoch": 0.22824, + "grad_norm": 0.8388889322992132, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 22824 + }, + { + "epoch": 0.22825, + "grad_norm": 0.8141244000835429, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 22825 + }, + { + "epoch": 0.22826, + "grad_norm": 0.8139046223127696, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 22826 + }, + { + "epoch": 0.22827, + "grad_norm": 0.9174284630998599, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 22827 + }, + { + "epoch": 0.22828, + "grad_norm": 0.8838765846096129, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 22828 + }, + { + "epoch": 0.22829, + "grad_norm": 0.8461518621141414, + "learning_rate": 0.003, + "loss": 4.027, + "step": 22829 + }, + { + "epoch": 0.2283, + "grad_norm": 0.8930878887878086, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 22830 + }, + { + "epoch": 0.22831, + "grad_norm": 1.1795609722644327, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 22831 + }, + { + "epoch": 0.22832, + "grad_norm": 0.9880840005024115, + "learning_rate": 0.003, + "loss": 4.061, + "step": 22832 + }, + { + "epoch": 0.22833, + "grad_norm": 0.8639869942846465, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 22833 + }, + { + "epoch": 0.22834, + "grad_norm": 0.9211993329776144, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 22834 + }, + { + "epoch": 0.22835, + "grad_norm": 1.0468431059164391, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 22835 + }, + { + "epoch": 0.22836, + "grad_norm": 1.245325142310674, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 22836 + }, + { + "epoch": 0.22837, + "grad_norm": 0.8997614990021262, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 22837 + }, + { + "epoch": 0.22838, + "grad_norm": 0.7223012100017461, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 22838 + }, + { + "epoch": 0.22839, + "grad_norm": 0.7114146903458733, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 22839 + }, + { + "epoch": 0.2284, + "grad_norm": 0.7968502569647606, + "learning_rate": 0.003, + "loss": 4.053, + "step": 22840 + }, + { + "epoch": 0.22841, + "grad_norm": 1.0252895278679306, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 22841 + }, + { + "epoch": 0.22842, + "grad_norm": 1.1009154227110782, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 22842 + }, + { + "epoch": 0.22843, + "grad_norm": 0.9348037932749858, + "learning_rate": 0.003, + "loss": 4.062, + "step": 22843 + }, + { + "epoch": 0.22844, + "grad_norm": 1.0187018353456725, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 22844 + }, + { + "epoch": 0.22845, + "grad_norm": 1.0757222856196504, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 22845 + }, + { + "epoch": 0.22846, + "grad_norm": 0.9321190629288321, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 22846 + }, + { + "epoch": 0.22847, + "grad_norm": 1.1090680297543973, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 22847 + }, + { + "epoch": 0.22848, + "grad_norm": 1.0260494042912045, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 22848 + }, + { + "epoch": 0.22849, + "grad_norm": 1.0234148870836144, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 22849 + }, + { + "epoch": 0.2285, + "grad_norm": 1.0063410273604398, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 22850 + }, + { + "epoch": 0.22851, + "grad_norm": 0.9518465127345284, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 22851 + }, + { + "epoch": 0.22852, + "grad_norm": 0.8896919039899941, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 22852 + }, + { + "epoch": 0.22853, + "grad_norm": 0.970127617852437, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 22853 + }, + { + "epoch": 0.22854, + "grad_norm": 1.1040297130814274, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 22854 + }, + { + "epoch": 0.22855, + "grad_norm": 1.1272146921961455, + "learning_rate": 0.003, + "loss": 4.098, + "step": 22855 + }, + { + "epoch": 0.22856, + "grad_norm": 0.9025382692018051, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 22856 + }, + { + "epoch": 0.22857, + "grad_norm": 0.9756305715682522, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 22857 + }, + { + "epoch": 0.22858, + "grad_norm": 1.1889158222663803, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 22858 + }, + { + "epoch": 0.22859, + "grad_norm": 0.8569899154104883, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 22859 + }, + { + "epoch": 0.2286, + "grad_norm": 0.858128212011132, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 22860 + }, + { + "epoch": 0.22861, + "grad_norm": 0.8459227504132819, + "learning_rate": 0.003, + "loss": 4.042, + "step": 22861 + }, + { + "epoch": 0.22862, + "grad_norm": 0.7765843307811464, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 22862 + }, + { + "epoch": 0.22863, + "grad_norm": 0.7706817304234865, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 22863 + }, + { + "epoch": 0.22864, + "grad_norm": 0.7255600341445928, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 22864 + }, + { + "epoch": 0.22865, + "grad_norm": 0.6813917713241208, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 22865 + }, + { + "epoch": 0.22866, + "grad_norm": 0.5738037421381981, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 22866 + }, + { + "epoch": 0.22867, + "grad_norm": 0.5507813134036691, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 22867 + }, + { + "epoch": 0.22868, + "grad_norm": 0.6832369511215274, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 22868 + }, + { + "epoch": 0.22869, + "grad_norm": 1.1055537764289742, + "learning_rate": 0.003, + "loss": 4.061, + "step": 22869 + }, + { + "epoch": 0.2287, + "grad_norm": 1.2224045301249717, + "learning_rate": 0.003, + "loss": 4.055, + "step": 22870 + }, + { + "epoch": 0.22871, + "grad_norm": 0.6513004531331329, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 22871 + }, + { + "epoch": 0.22872, + "grad_norm": 0.6151857890607966, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 22872 + }, + { + "epoch": 0.22873, + "grad_norm": 0.7166660684404108, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 22873 + }, + { + "epoch": 0.22874, + "grad_norm": 0.6904437340497975, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 22874 + }, + { + "epoch": 0.22875, + "grad_norm": 0.6631850554355596, + "learning_rate": 0.003, + "loss": 4.0074, + "step": 22875 + }, + { + "epoch": 0.22876, + "grad_norm": 0.7179607912734198, + "learning_rate": 0.003, + "loss": 4.0067, + "step": 22876 + }, + { + "epoch": 0.22877, + "grad_norm": 0.9976481130477168, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 22877 + }, + { + "epoch": 0.22878, + "grad_norm": 1.300526202817369, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 22878 + }, + { + "epoch": 0.22879, + "grad_norm": 0.6375483840950272, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 22879 + }, + { + "epoch": 0.2288, + "grad_norm": 0.7579041704260526, + "learning_rate": 0.003, + "loss": 4.041, + "step": 22880 + }, + { + "epoch": 0.22881, + "grad_norm": 0.8667024385310933, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 22881 + }, + { + "epoch": 0.22882, + "grad_norm": 0.9157192582314859, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 22882 + }, + { + "epoch": 0.22883, + "grad_norm": 0.9356479890036767, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 22883 + }, + { + "epoch": 0.22884, + "grad_norm": 0.8954344958456766, + "learning_rate": 0.003, + "loss": 4.029, + "step": 22884 + }, + { + "epoch": 0.22885, + "grad_norm": 0.9041830458560043, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 22885 + }, + { + "epoch": 0.22886, + "grad_norm": 1.0603001011635644, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 22886 + }, + { + "epoch": 0.22887, + "grad_norm": 1.1936793786653745, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 22887 + }, + { + "epoch": 0.22888, + "grad_norm": 0.7928966771319186, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 22888 + }, + { + "epoch": 0.22889, + "grad_norm": 0.7250793645787325, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 22889 + }, + { + "epoch": 0.2289, + "grad_norm": 0.6459945149022985, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 22890 + }, + { + "epoch": 0.22891, + "grad_norm": 0.644664663994461, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 22891 + }, + { + "epoch": 0.22892, + "grad_norm": 0.7146749073703039, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 22892 + }, + { + "epoch": 0.22893, + "grad_norm": 0.8059523448770808, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 22893 + }, + { + "epoch": 0.22894, + "grad_norm": 0.8299869209530397, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 22894 + }, + { + "epoch": 0.22895, + "grad_norm": 0.791062985100653, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 22895 + }, + { + "epoch": 0.22896, + "grad_norm": 0.9253667717221834, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 22896 + }, + { + "epoch": 0.22897, + "grad_norm": 0.916825194225468, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 22897 + }, + { + "epoch": 0.22898, + "grad_norm": 0.900905856485575, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 22898 + }, + { + "epoch": 0.22899, + "grad_norm": 0.9855931961412255, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 22899 + }, + { + "epoch": 0.229, + "grad_norm": 1.0862993704192567, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 22900 + }, + { + "epoch": 0.22901, + "grad_norm": 0.9833426033059771, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 22901 + }, + { + "epoch": 0.22902, + "grad_norm": 0.9626526031839022, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 22902 + }, + { + "epoch": 0.22903, + "grad_norm": 0.87181432723415, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 22903 + }, + { + "epoch": 0.22904, + "grad_norm": 0.9400562772563169, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 22904 + }, + { + "epoch": 0.22905, + "grad_norm": 0.9984794930427514, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 22905 + }, + { + "epoch": 0.22906, + "grad_norm": 1.002218107887175, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 22906 + }, + { + "epoch": 0.22907, + "grad_norm": 0.9102772936374971, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 22907 + }, + { + "epoch": 0.22908, + "grad_norm": 1.1172984967001958, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 22908 + }, + { + "epoch": 0.22909, + "grad_norm": 1.1244459021174908, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 22909 + }, + { + "epoch": 0.2291, + "grad_norm": 1.163742705238034, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 22910 + }, + { + "epoch": 0.22911, + "grad_norm": 0.8542478953851748, + "learning_rate": 0.003, + "loss": 4.043, + "step": 22911 + }, + { + "epoch": 0.22912, + "grad_norm": 0.8479751087676674, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 22912 + }, + { + "epoch": 0.22913, + "grad_norm": 1.0237665976024057, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 22913 + }, + { + "epoch": 0.22914, + "grad_norm": 1.0245501065829639, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 22914 + }, + { + "epoch": 0.22915, + "grad_norm": 0.905597520989836, + "learning_rate": 0.003, + "loss": 4.063, + "step": 22915 + }, + { + "epoch": 0.22916, + "grad_norm": 0.8452178131475603, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 22916 + }, + { + "epoch": 0.22917, + "grad_norm": 0.7676683649543508, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 22917 + }, + { + "epoch": 0.22918, + "grad_norm": 0.8347297353087648, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 22918 + }, + { + "epoch": 0.22919, + "grad_norm": 0.9238730428877724, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 22919 + }, + { + "epoch": 0.2292, + "grad_norm": 0.9948638665174948, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 22920 + }, + { + "epoch": 0.22921, + "grad_norm": 1.0204471285302084, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 22921 + }, + { + "epoch": 0.22922, + "grad_norm": 1.0567623779721371, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 22922 + }, + { + "epoch": 0.22923, + "grad_norm": 0.8598295176574143, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 22923 + }, + { + "epoch": 0.22924, + "grad_norm": 0.7410298075784536, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 22924 + }, + { + "epoch": 0.22925, + "grad_norm": 0.6810413229843065, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 22925 + }, + { + "epoch": 0.22926, + "grad_norm": 0.6324416583182618, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 22926 + }, + { + "epoch": 0.22927, + "grad_norm": 0.6668615354737717, + "learning_rate": 0.003, + "loss": 4.055, + "step": 22927 + }, + { + "epoch": 0.22928, + "grad_norm": 0.7976919923272611, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 22928 + }, + { + "epoch": 0.22929, + "grad_norm": 0.8562862406451046, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 22929 + }, + { + "epoch": 0.2293, + "grad_norm": 0.9892108356188348, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 22930 + }, + { + "epoch": 0.22931, + "grad_norm": 0.9879965267580636, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 22931 + }, + { + "epoch": 0.22932, + "grad_norm": 0.8771048374138064, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 22932 + }, + { + "epoch": 0.22933, + "grad_norm": 0.8392038939464274, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 22933 + }, + { + "epoch": 0.22934, + "grad_norm": 0.8252270106607972, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 22934 + }, + { + "epoch": 0.22935, + "grad_norm": 0.8725316834366568, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 22935 + }, + { + "epoch": 0.22936, + "grad_norm": 0.9484097659068086, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 22936 + }, + { + "epoch": 0.22937, + "grad_norm": 1.339929854033913, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 22937 + }, + { + "epoch": 0.22938, + "grad_norm": 0.9453912054839796, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 22938 + }, + { + "epoch": 0.22939, + "grad_norm": 0.9252437224548369, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 22939 + }, + { + "epoch": 0.2294, + "grad_norm": 0.8626464385891859, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 22940 + }, + { + "epoch": 0.22941, + "grad_norm": 0.9741466563509095, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 22941 + }, + { + "epoch": 0.22942, + "grad_norm": 1.035526456224067, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 22942 + }, + { + "epoch": 0.22943, + "grad_norm": 0.9534015075296856, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 22943 + }, + { + "epoch": 0.22944, + "grad_norm": 0.9775281928913955, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 22944 + }, + { + "epoch": 0.22945, + "grad_norm": 1.0681686522829308, + "learning_rate": 0.003, + "loss": 4.055, + "step": 22945 + }, + { + "epoch": 0.22946, + "grad_norm": 1.0443875198340178, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 22946 + }, + { + "epoch": 0.22947, + "grad_norm": 0.9303181436743452, + "learning_rate": 0.003, + "loss": 4.06, + "step": 22947 + }, + { + "epoch": 0.22948, + "grad_norm": 0.8805204069071085, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 22948 + }, + { + "epoch": 0.22949, + "grad_norm": 0.8073268543094761, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 22949 + }, + { + "epoch": 0.2295, + "grad_norm": 0.7990491002563296, + "learning_rate": 0.003, + "loss": 4.057, + "step": 22950 + }, + { + "epoch": 0.22951, + "grad_norm": 0.7990195049561448, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 22951 + }, + { + "epoch": 0.22952, + "grad_norm": 0.8028268297177463, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 22952 + }, + { + "epoch": 0.22953, + "grad_norm": 0.7812014546426375, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 22953 + }, + { + "epoch": 0.22954, + "grad_norm": 0.7896531312946842, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 22954 + }, + { + "epoch": 0.22955, + "grad_norm": 0.7152243663377584, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 22955 + }, + { + "epoch": 0.22956, + "grad_norm": 0.6466416504931088, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 22956 + }, + { + "epoch": 0.22957, + "grad_norm": 0.6699337604433011, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 22957 + }, + { + "epoch": 0.22958, + "grad_norm": 0.5800195096240598, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 22958 + }, + { + "epoch": 0.22959, + "grad_norm": 0.6335042890350372, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 22959 + }, + { + "epoch": 0.2296, + "grad_norm": 0.581973903016988, + "learning_rate": 0.003, + "loss": 3.9851, + "step": 22960 + }, + { + "epoch": 0.22961, + "grad_norm": 0.5812413821469002, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 22961 + }, + { + "epoch": 0.22962, + "grad_norm": 0.6755407643154575, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 22962 + }, + { + "epoch": 0.22963, + "grad_norm": 0.723218674105609, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 22963 + }, + { + "epoch": 0.22964, + "grad_norm": 0.8057990456142935, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 22964 + }, + { + "epoch": 0.22965, + "grad_norm": 1.0252109799460332, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 22965 + }, + { + "epoch": 0.22966, + "grad_norm": 1.3427812705845885, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 22966 + }, + { + "epoch": 0.22967, + "grad_norm": 0.8797474436401552, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 22967 + }, + { + "epoch": 0.22968, + "grad_norm": 0.9576048558955592, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 22968 + }, + { + "epoch": 0.22969, + "grad_norm": 0.944415388266654, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 22969 + }, + { + "epoch": 0.2297, + "grad_norm": 1.1375977085296949, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 22970 + }, + { + "epoch": 0.22971, + "grad_norm": 0.9591753880489413, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 22971 + }, + { + "epoch": 0.22972, + "grad_norm": 0.8148959203882464, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 22972 + }, + { + "epoch": 0.22973, + "grad_norm": 0.7743403967676066, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 22973 + }, + { + "epoch": 0.22974, + "grad_norm": 0.8639527765579142, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 22974 + }, + { + "epoch": 0.22975, + "grad_norm": 1.0442579089795847, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 22975 + }, + { + "epoch": 0.22976, + "grad_norm": 1.0369976250361848, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 22976 + }, + { + "epoch": 0.22977, + "grad_norm": 1.109288992260987, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 22977 + }, + { + "epoch": 0.22978, + "grad_norm": 1.1558766156710585, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 22978 + }, + { + "epoch": 0.22979, + "grad_norm": 1.0285031591188565, + "learning_rate": 0.003, + "loss": 4.059, + "step": 22979 + }, + { + "epoch": 0.2298, + "grad_norm": 0.9182114991127589, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 22980 + }, + { + "epoch": 0.22981, + "grad_norm": 0.8586159639490722, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 22981 + }, + { + "epoch": 0.22982, + "grad_norm": 0.7320856724910753, + "learning_rate": 0.003, + "loss": 4.049, + "step": 22982 + }, + { + "epoch": 0.22983, + "grad_norm": 0.7573053317615691, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 22983 + }, + { + "epoch": 0.22984, + "grad_norm": 0.7364799613121218, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 22984 + }, + { + "epoch": 0.22985, + "grad_norm": 0.7517083418231769, + "learning_rate": 0.003, + "loss": 4.048, + "step": 22985 + }, + { + "epoch": 0.22986, + "grad_norm": 0.7121643498601627, + "learning_rate": 0.003, + "loss": 4.027, + "step": 22986 + }, + { + "epoch": 0.22987, + "grad_norm": 0.7442075739463677, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 22987 + }, + { + "epoch": 0.22988, + "grad_norm": 0.7675063175417414, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 22988 + }, + { + "epoch": 0.22989, + "grad_norm": 0.9094691556004088, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 22989 + }, + { + "epoch": 0.2299, + "grad_norm": 0.9897633770199264, + "learning_rate": 0.003, + "loss": 4.055, + "step": 22990 + }, + { + "epoch": 0.22991, + "grad_norm": 1.1205124593215388, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 22991 + }, + { + "epoch": 0.22992, + "grad_norm": 1.069519546680366, + "learning_rate": 0.003, + "loss": 4.044, + "step": 22992 + }, + { + "epoch": 0.22993, + "grad_norm": 1.052990148523248, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 22993 + }, + { + "epoch": 0.22994, + "grad_norm": 1.0578793869125982, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 22994 + }, + { + "epoch": 0.22995, + "grad_norm": 1.1542369296925874, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 22995 + }, + { + "epoch": 0.22996, + "grad_norm": 1.0761176648478468, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 22996 + }, + { + "epoch": 0.22997, + "grad_norm": 1.0180931606093209, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 22997 + }, + { + "epoch": 0.22998, + "grad_norm": 0.9458943977593038, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 22998 + }, + { + "epoch": 0.22999, + "grad_norm": 0.8220670512733281, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 22999 + }, + { + "epoch": 0.23, + "grad_norm": 0.6440144646518966, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 23000 + }, + { + "epoch": 0.23001, + "grad_norm": 0.5755514750379364, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 23001 + }, + { + "epoch": 0.23002, + "grad_norm": 0.6407383692328728, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 23002 + }, + { + "epoch": 0.23003, + "grad_norm": 0.7322045865591256, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 23003 + }, + { + "epoch": 0.23004, + "grad_norm": 0.912968524887482, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 23004 + }, + { + "epoch": 0.23005, + "grad_norm": 1.1448046014698365, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 23005 + }, + { + "epoch": 0.23006, + "grad_norm": 0.9912597420142032, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 23006 + }, + { + "epoch": 0.23007, + "grad_norm": 0.9797687843984587, + "learning_rate": 0.003, + "loss": 4.046, + "step": 23007 + }, + { + "epoch": 0.23008, + "grad_norm": 0.8562928902048835, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 23008 + }, + { + "epoch": 0.23009, + "grad_norm": 0.8442274961912204, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 23009 + }, + { + "epoch": 0.2301, + "grad_norm": 0.8947168340077016, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 23010 + }, + { + "epoch": 0.23011, + "grad_norm": 1.0647803346246902, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 23011 + }, + { + "epoch": 0.23012, + "grad_norm": 1.0482378466819748, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 23012 + }, + { + "epoch": 0.23013, + "grad_norm": 0.9665854253183856, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 23013 + }, + { + "epoch": 0.23014, + "grad_norm": 0.7918976868664909, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 23014 + }, + { + "epoch": 0.23015, + "grad_norm": 0.7504563422319043, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 23015 + }, + { + "epoch": 0.23016, + "grad_norm": 0.7846609882630923, + "learning_rate": 0.003, + "loss": 4.037, + "step": 23016 + }, + { + "epoch": 0.23017, + "grad_norm": 0.6953285756158426, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 23017 + }, + { + "epoch": 0.23018, + "grad_norm": 0.7182305490736426, + "learning_rate": 0.003, + "loss": 4.019, + "step": 23018 + }, + { + "epoch": 0.23019, + "grad_norm": 0.8230608160580406, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 23019 + }, + { + "epoch": 0.2302, + "grad_norm": 0.7621053002492446, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 23020 + }, + { + "epoch": 0.23021, + "grad_norm": 0.732343101266712, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 23021 + }, + { + "epoch": 0.23022, + "grad_norm": 0.6979325865328343, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 23022 + }, + { + "epoch": 0.23023, + "grad_norm": 0.6491460396937742, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 23023 + }, + { + "epoch": 0.23024, + "grad_norm": 0.7407802564372362, + "learning_rate": 0.003, + "loss": 4.0111, + "step": 23024 + }, + { + "epoch": 0.23025, + "grad_norm": 0.7129150741142731, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 23025 + }, + { + "epoch": 0.23026, + "grad_norm": 0.5648904117697976, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 23026 + }, + { + "epoch": 0.23027, + "grad_norm": 0.6313451167071286, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 23027 + }, + { + "epoch": 0.23028, + "grad_norm": 0.6918169092622712, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 23028 + }, + { + "epoch": 0.23029, + "grad_norm": 0.8804208104690182, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 23029 + }, + { + "epoch": 0.2303, + "grad_norm": 1.3311874724407622, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 23030 + }, + { + "epoch": 0.23031, + "grad_norm": 1.238694234393582, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 23031 + }, + { + "epoch": 0.23032, + "grad_norm": 0.8039568917642566, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 23032 + }, + { + "epoch": 0.23033, + "grad_norm": 0.7671679413513601, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 23033 + }, + { + "epoch": 0.23034, + "grad_norm": 0.7232051042304448, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 23034 + }, + { + "epoch": 0.23035, + "grad_norm": 0.67765558853382, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 23035 + }, + { + "epoch": 0.23036, + "grad_norm": 0.7468434783455861, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 23036 + }, + { + "epoch": 0.23037, + "grad_norm": 0.8810508270969094, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 23037 + }, + { + "epoch": 0.23038, + "grad_norm": 1.0703990078286738, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 23038 + }, + { + "epoch": 0.23039, + "grad_norm": 1.1634087956640424, + "learning_rate": 0.003, + "loss": 4.037, + "step": 23039 + }, + { + "epoch": 0.2304, + "grad_norm": 0.9932499531102147, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 23040 + }, + { + "epoch": 0.23041, + "grad_norm": 1.1867958003879997, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 23041 + }, + { + "epoch": 0.23042, + "grad_norm": 0.9442961368857763, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 23042 + }, + { + "epoch": 0.23043, + "grad_norm": 0.9531702275296813, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 23043 + }, + { + "epoch": 0.23044, + "grad_norm": 1.1339833255036662, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 23044 + }, + { + "epoch": 0.23045, + "grad_norm": 0.9114005226081583, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 23045 + }, + { + "epoch": 0.23046, + "grad_norm": 0.8518906522540243, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 23046 + }, + { + "epoch": 0.23047, + "grad_norm": 0.8229079410946365, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 23047 + }, + { + "epoch": 0.23048, + "grad_norm": 0.8242342866953422, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 23048 + }, + { + "epoch": 0.23049, + "grad_norm": 0.8072608110455313, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 23049 + }, + { + "epoch": 0.2305, + "grad_norm": 0.9430069625495672, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 23050 + }, + { + "epoch": 0.23051, + "grad_norm": 0.9796725003736887, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 23051 + }, + { + "epoch": 0.23052, + "grad_norm": 1.0099302599233693, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 23052 + }, + { + "epoch": 0.23053, + "grad_norm": 1.0245568456074743, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 23053 + }, + { + "epoch": 0.23054, + "grad_norm": 1.1409643629234234, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 23054 + }, + { + "epoch": 0.23055, + "grad_norm": 1.0230247137167217, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 23055 + }, + { + "epoch": 0.23056, + "grad_norm": 1.034256782464772, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 23056 + }, + { + "epoch": 0.23057, + "grad_norm": 1.036737098806506, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 23057 + }, + { + "epoch": 0.23058, + "grad_norm": 1.037351210038279, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 23058 + }, + { + "epoch": 0.23059, + "grad_norm": 0.9519086559547316, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 23059 + }, + { + "epoch": 0.2306, + "grad_norm": 0.8959661672152691, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 23060 + }, + { + "epoch": 0.23061, + "grad_norm": 0.8903507833817338, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 23061 + }, + { + "epoch": 0.23062, + "grad_norm": 0.8807956259666923, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 23062 + }, + { + "epoch": 0.23063, + "grad_norm": 0.9844388200826915, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 23063 + }, + { + "epoch": 0.23064, + "grad_norm": 1.208210992824724, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 23064 + }, + { + "epoch": 0.23065, + "grad_norm": 0.8025078155500408, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 23065 + }, + { + "epoch": 0.23066, + "grad_norm": 0.781969026190564, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 23066 + }, + { + "epoch": 0.23067, + "grad_norm": 0.7525009160614471, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 23067 + }, + { + "epoch": 0.23068, + "grad_norm": 0.7400231509723334, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 23068 + }, + { + "epoch": 0.23069, + "grad_norm": 0.7412303677565018, + "learning_rate": 0.003, + "loss": 4.054, + "step": 23069 + }, + { + "epoch": 0.2307, + "grad_norm": 0.6901599980350567, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 23070 + }, + { + "epoch": 0.23071, + "grad_norm": 0.6291578041103024, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 23071 + }, + { + "epoch": 0.23072, + "grad_norm": 0.6426487652646763, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 23072 + }, + { + "epoch": 0.23073, + "grad_norm": 0.7484153343286141, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 23073 + }, + { + "epoch": 0.23074, + "grad_norm": 0.8591630184030804, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 23074 + }, + { + "epoch": 0.23075, + "grad_norm": 0.9461249230965478, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 23075 + }, + { + "epoch": 0.23076, + "grad_norm": 1.1122012616960164, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 23076 + }, + { + "epoch": 0.23077, + "grad_norm": 1.0383414062056104, + "learning_rate": 0.003, + "loss": 4.048, + "step": 23077 + }, + { + "epoch": 0.23078, + "grad_norm": 0.9458084129572396, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 23078 + }, + { + "epoch": 0.23079, + "grad_norm": 0.8180342130513747, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 23079 + }, + { + "epoch": 0.2308, + "grad_norm": 0.88701227096098, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 23080 + }, + { + "epoch": 0.23081, + "grad_norm": 0.9524663541299678, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 23081 + }, + { + "epoch": 0.23082, + "grad_norm": 1.0767976989171717, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 23082 + }, + { + "epoch": 0.23083, + "grad_norm": 0.9238024718076581, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 23083 + }, + { + "epoch": 0.23084, + "grad_norm": 0.9819205078043135, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 23084 + }, + { + "epoch": 0.23085, + "grad_norm": 1.183068481802817, + "learning_rate": 0.003, + "loss": 4.038, + "step": 23085 + }, + { + "epoch": 0.23086, + "grad_norm": 1.0401157212986032, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 23086 + }, + { + "epoch": 0.23087, + "grad_norm": 1.0400128905396773, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 23087 + }, + { + "epoch": 0.23088, + "grad_norm": 1.0003200233920644, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 23088 + }, + { + "epoch": 0.23089, + "grad_norm": 0.8736122936941668, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 23089 + }, + { + "epoch": 0.2309, + "grad_norm": 0.7665087221210782, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 23090 + }, + { + "epoch": 0.23091, + "grad_norm": 0.8327630270085647, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 23091 + }, + { + "epoch": 0.23092, + "grad_norm": 0.9134355021202011, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 23092 + }, + { + "epoch": 0.23093, + "grad_norm": 1.051066563244841, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 23093 + }, + { + "epoch": 0.23094, + "grad_norm": 1.0214354124435396, + "learning_rate": 0.003, + "loss": 4.053, + "step": 23094 + }, + { + "epoch": 0.23095, + "grad_norm": 0.8958983175184104, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 23095 + }, + { + "epoch": 0.23096, + "grad_norm": 0.9559871365111267, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 23096 + }, + { + "epoch": 0.23097, + "grad_norm": 1.127557277530838, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 23097 + }, + { + "epoch": 0.23098, + "grad_norm": 0.9581631011438779, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 23098 + }, + { + "epoch": 0.23099, + "grad_norm": 0.891490700702359, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 23099 + }, + { + "epoch": 0.231, + "grad_norm": 0.8458326421640169, + "learning_rate": 0.003, + "loss": 4.064, + "step": 23100 + }, + { + "epoch": 0.23101, + "grad_norm": 0.8426055006721699, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 23101 + }, + { + "epoch": 0.23102, + "grad_norm": 0.8186155305996721, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 23102 + }, + { + "epoch": 0.23103, + "grad_norm": 0.7713171618089862, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 23103 + }, + { + "epoch": 0.23104, + "grad_norm": 0.8339620123432792, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 23104 + }, + { + "epoch": 0.23105, + "grad_norm": 0.8259863446093868, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 23105 + }, + { + "epoch": 0.23106, + "grad_norm": 0.8282756771482723, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 23106 + }, + { + "epoch": 0.23107, + "grad_norm": 0.700501615712469, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 23107 + }, + { + "epoch": 0.23108, + "grad_norm": 0.6803929830388348, + "learning_rate": 0.003, + "loss": 4.001, + "step": 23108 + }, + { + "epoch": 0.23109, + "grad_norm": 0.7042258069211237, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 23109 + }, + { + "epoch": 0.2311, + "grad_norm": 0.8167577998600485, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 23110 + }, + { + "epoch": 0.23111, + "grad_norm": 1.1302336645143332, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 23111 + }, + { + "epoch": 0.23112, + "grad_norm": 1.2451510774546128, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 23112 + }, + { + "epoch": 0.23113, + "grad_norm": 0.733608198620074, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 23113 + }, + { + "epoch": 0.23114, + "grad_norm": 0.641177426296802, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 23114 + }, + { + "epoch": 0.23115, + "grad_norm": 0.7308422523981293, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 23115 + }, + { + "epoch": 0.23116, + "grad_norm": 0.8609629458726281, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 23116 + }, + { + "epoch": 0.23117, + "grad_norm": 1.0017284900307113, + "learning_rate": 0.003, + "loss": 4.065, + "step": 23117 + }, + { + "epoch": 0.23118, + "grad_norm": 1.095749053592283, + "learning_rate": 0.003, + "loss": 4.044, + "step": 23118 + }, + { + "epoch": 0.23119, + "grad_norm": 1.0334151111415144, + "learning_rate": 0.003, + "loss": 4.067, + "step": 23119 + }, + { + "epoch": 0.2312, + "grad_norm": 1.136214537832086, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 23120 + }, + { + "epoch": 0.23121, + "grad_norm": 0.8602315868680853, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 23121 + }, + { + "epoch": 0.23122, + "grad_norm": 0.7188755912748588, + "learning_rate": 0.003, + "loss": 4.044, + "step": 23122 + }, + { + "epoch": 0.23123, + "grad_norm": 0.6831341891311914, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 23123 + }, + { + "epoch": 0.23124, + "grad_norm": 0.689386101602556, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 23124 + }, + { + "epoch": 0.23125, + "grad_norm": 0.7949028140751238, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 23125 + }, + { + "epoch": 0.23126, + "grad_norm": 0.8925072686265749, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 23126 + }, + { + "epoch": 0.23127, + "grad_norm": 1.030558238710072, + "learning_rate": 0.003, + "loss": 4.046, + "step": 23127 + }, + { + "epoch": 0.23128, + "grad_norm": 1.1869892077913553, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 23128 + }, + { + "epoch": 0.23129, + "grad_norm": 0.7979618263337956, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 23129 + }, + { + "epoch": 0.2313, + "grad_norm": 0.7541219421787313, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 23130 + }, + { + "epoch": 0.23131, + "grad_norm": 0.728790578622073, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 23131 + }, + { + "epoch": 0.23132, + "grad_norm": 0.6654525142196901, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 23132 + }, + { + "epoch": 0.23133, + "grad_norm": 0.7140478749674254, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 23133 + }, + { + "epoch": 0.23134, + "grad_norm": 0.8223033332588107, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 23134 + }, + { + "epoch": 0.23135, + "grad_norm": 1.0518448763186588, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 23135 + }, + { + "epoch": 0.23136, + "grad_norm": 0.9293349576285221, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 23136 + }, + { + "epoch": 0.23137, + "grad_norm": 0.8788913747286137, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 23137 + }, + { + "epoch": 0.23138, + "grad_norm": 0.9880121834966458, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 23138 + }, + { + "epoch": 0.23139, + "grad_norm": 1.1796886910094022, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 23139 + }, + { + "epoch": 0.2314, + "grad_norm": 0.8964341593056058, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 23140 + }, + { + "epoch": 0.23141, + "grad_norm": 0.9740490172641584, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 23141 + }, + { + "epoch": 0.23142, + "grad_norm": 1.149619858298541, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 23142 + }, + { + "epoch": 0.23143, + "grad_norm": 0.8801661951396532, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 23143 + }, + { + "epoch": 0.23144, + "grad_norm": 0.8646141631667361, + "learning_rate": 0.003, + "loss": 4.037, + "step": 23144 + }, + { + "epoch": 0.23145, + "grad_norm": 0.8077041410823514, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 23145 + }, + { + "epoch": 0.23146, + "grad_norm": 0.9296332270586543, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 23146 + }, + { + "epoch": 0.23147, + "grad_norm": 1.0281759610629906, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 23147 + }, + { + "epoch": 0.23148, + "grad_norm": 0.8332539633933264, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 23148 + }, + { + "epoch": 0.23149, + "grad_norm": 0.6718691492607541, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 23149 + }, + { + "epoch": 0.2315, + "grad_norm": 0.794417872689008, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 23150 + }, + { + "epoch": 0.23151, + "grad_norm": 0.8743968158609591, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 23151 + }, + { + "epoch": 0.23152, + "grad_norm": 0.7940753328224843, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 23152 + }, + { + "epoch": 0.23153, + "grad_norm": 0.7612152619273527, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 23153 + }, + { + "epoch": 0.23154, + "grad_norm": 0.8483713152037101, + "learning_rate": 0.003, + "loss": 4.052, + "step": 23154 + }, + { + "epoch": 0.23155, + "grad_norm": 0.7780967753220992, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 23155 + }, + { + "epoch": 0.23156, + "grad_norm": 0.8751698564680405, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 23156 + }, + { + "epoch": 0.23157, + "grad_norm": 0.9738745667518447, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 23157 + }, + { + "epoch": 0.23158, + "grad_norm": 0.8913304238523949, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 23158 + }, + { + "epoch": 0.23159, + "grad_norm": 1.1645725784000678, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 23159 + }, + { + "epoch": 0.2316, + "grad_norm": 1.1682181889148886, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 23160 + }, + { + "epoch": 0.23161, + "grad_norm": 0.8344153393977471, + "learning_rate": 0.003, + "loss": 4.033, + "step": 23161 + }, + { + "epoch": 0.23162, + "grad_norm": 0.6493871502833799, + "learning_rate": 0.003, + "loss": 4.039, + "step": 23162 + }, + { + "epoch": 0.23163, + "grad_norm": 0.6769824395401683, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 23163 + }, + { + "epoch": 0.23164, + "grad_norm": 0.7366738323543714, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 23164 + }, + { + "epoch": 0.23165, + "grad_norm": 0.7724693297848358, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 23165 + }, + { + "epoch": 0.23166, + "grad_norm": 0.9186543965596209, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 23166 + }, + { + "epoch": 0.23167, + "grad_norm": 1.130344443604793, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 23167 + }, + { + "epoch": 0.23168, + "grad_norm": 0.9086773580393178, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 23168 + }, + { + "epoch": 0.23169, + "grad_norm": 0.8513501304435446, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 23169 + }, + { + "epoch": 0.2317, + "grad_norm": 0.9071029195969358, + "learning_rate": 0.003, + "loss": 4.058, + "step": 23170 + }, + { + "epoch": 0.23171, + "grad_norm": 0.9863758150106552, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 23171 + }, + { + "epoch": 0.23172, + "grad_norm": 1.1227519604261884, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 23172 + }, + { + "epoch": 0.23173, + "grad_norm": 0.8465970644388893, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 23173 + }, + { + "epoch": 0.23174, + "grad_norm": 0.8323049945940443, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 23174 + }, + { + "epoch": 0.23175, + "grad_norm": 0.8207785622563308, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 23175 + }, + { + "epoch": 0.23176, + "grad_norm": 0.8776604899774045, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 23176 + }, + { + "epoch": 0.23177, + "grad_norm": 1.2362603218990649, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 23177 + }, + { + "epoch": 0.23178, + "grad_norm": 0.8435599941035459, + "learning_rate": 0.003, + "loss": 4.041, + "step": 23178 + }, + { + "epoch": 0.23179, + "grad_norm": 0.8611331828490991, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 23179 + }, + { + "epoch": 0.2318, + "grad_norm": 0.995223681188264, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 23180 + }, + { + "epoch": 0.23181, + "grad_norm": 1.0379370750698478, + "learning_rate": 0.003, + "loss": 4.082, + "step": 23181 + }, + { + "epoch": 0.23182, + "grad_norm": 0.9090692891478289, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 23182 + }, + { + "epoch": 0.23183, + "grad_norm": 0.8277825840618702, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 23183 + }, + { + "epoch": 0.23184, + "grad_norm": 0.8456226998971755, + "learning_rate": 0.003, + "loss": 4.057, + "step": 23184 + }, + { + "epoch": 0.23185, + "grad_norm": 0.8720691591032947, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 23185 + }, + { + "epoch": 0.23186, + "grad_norm": 0.9220562305550305, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 23186 + }, + { + "epoch": 0.23187, + "grad_norm": 0.851438738350241, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 23187 + }, + { + "epoch": 0.23188, + "grad_norm": 0.7818525520935256, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 23188 + }, + { + "epoch": 0.23189, + "grad_norm": 0.7286503035114896, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 23189 + }, + { + "epoch": 0.2319, + "grad_norm": 0.7421027077402059, + "learning_rate": 0.003, + "loss": 4.029, + "step": 23190 + }, + { + "epoch": 0.23191, + "grad_norm": 0.7123163633494964, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 23191 + }, + { + "epoch": 0.23192, + "grad_norm": 0.7167199220165789, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 23192 + }, + { + "epoch": 0.23193, + "grad_norm": 0.7218936007899907, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 23193 + }, + { + "epoch": 0.23194, + "grad_norm": 0.7383427430131531, + "learning_rate": 0.003, + "loss": 4.009, + "step": 23194 + }, + { + "epoch": 0.23195, + "grad_norm": 0.7579351615694749, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 23195 + }, + { + "epoch": 0.23196, + "grad_norm": 0.8254898925429789, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 23196 + }, + { + "epoch": 0.23197, + "grad_norm": 1.1471002586596273, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 23197 + }, + { + "epoch": 0.23198, + "grad_norm": 1.2317359165769932, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 23198 + }, + { + "epoch": 0.23199, + "grad_norm": 0.8705700836540348, + "learning_rate": 0.003, + "loss": 4.057, + "step": 23199 + }, + { + "epoch": 0.232, + "grad_norm": 0.8386864939544902, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 23200 + }, + { + "epoch": 0.23201, + "grad_norm": 0.8831417143911874, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 23201 + }, + { + "epoch": 0.23202, + "grad_norm": 0.8989113483203732, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 23202 + }, + { + "epoch": 0.23203, + "grad_norm": 0.9319952181445379, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 23203 + }, + { + "epoch": 0.23204, + "grad_norm": 1.1234857880248483, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 23204 + }, + { + "epoch": 0.23205, + "grad_norm": 0.9646882635849044, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 23205 + }, + { + "epoch": 0.23206, + "grad_norm": 1.1204598275221551, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 23206 + }, + { + "epoch": 0.23207, + "grad_norm": 0.8468977841556461, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 23207 + }, + { + "epoch": 0.23208, + "grad_norm": 0.8736054694554825, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 23208 + }, + { + "epoch": 0.23209, + "grad_norm": 0.793857771129506, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 23209 + }, + { + "epoch": 0.2321, + "grad_norm": 0.7829459826281823, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 23210 + }, + { + "epoch": 0.23211, + "grad_norm": 0.9743869245867949, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 23211 + }, + { + "epoch": 0.23212, + "grad_norm": 1.0099969444714338, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 23212 + }, + { + "epoch": 0.23213, + "grad_norm": 0.9772473599852297, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 23213 + }, + { + "epoch": 0.23214, + "grad_norm": 1.1090191942947476, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 23214 + }, + { + "epoch": 0.23215, + "grad_norm": 0.9809795265303912, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 23215 + }, + { + "epoch": 0.23216, + "grad_norm": 1.1022901248934491, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 23216 + }, + { + "epoch": 0.23217, + "grad_norm": 0.9994053700063643, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 23217 + }, + { + "epoch": 0.23218, + "grad_norm": 0.9518282222486462, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 23218 + }, + { + "epoch": 0.23219, + "grad_norm": 0.8901810578367069, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 23219 + }, + { + "epoch": 0.2322, + "grad_norm": 0.7379109720421644, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 23220 + }, + { + "epoch": 0.23221, + "grad_norm": 0.6304443394079544, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 23221 + }, + { + "epoch": 0.23222, + "grad_norm": 0.6039519856203805, + "learning_rate": 0.003, + "loss": 4.024, + "step": 23222 + }, + { + "epoch": 0.23223, + "grad_norm": 0.7261943503246203, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 23223 + }, + { + "epoch": 0.23224, + "grad_norm": 0.8061088900498037, + "learning_rate": 0.003, + "loss": 4.044, + "step": 23224 + }, + { + "epoch": 0.23225, + "grad_norm": 0.8482348430318354, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 23225 + }, + { + "epoch": 0.23226, + "grad_norm": 0.9174531207170211, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 23226 + }, + { + "epoch": 0.23227, + "grad_norm": 1.1570666209810108, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 23227 + }, + { + "epoch": 0.23228, + "grad_norm": 0.9107358994765787, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 23228 + }, + { + "epoch": 0.23229, + "grad_norm": 0.867793013833244, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 23229 + }, + { + "epoch": 0.2323, + "grad_norm": 0.8031669535488789, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 23230 + }, + { + "epoch": 0.23231, + "grad_norm": 0.9526980439082624, + "learning_rate": 0.003, + "loss": 4.021, + "step": 23231 + }, + { + "epoch": 0.23232, + "grad_norm": 1.0983306831463389, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 23232 + }, + { + "epoch": 0.23233, + "grad_norm": 0.9172141034188522, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 23233 + }, + { + "epoch": 0.23234, + "grad_norm": 0.8780316601312603, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 23234 + }, + { + "epoch": 0.23235, + "grad_norm": 0.8657486185162844, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 23235 + }, + { + "epoch": 0.23236, + "grad_norm": 0.7867284711629754, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 23236 + }, + { + "epoch": 0.23237, + "grad_norm": 0.8593762302387505, + "learning_rate": 0.003, + "loss": 4.046, + "step": 23237 + }, + { + "epoch": 0.23238, + "grad_norm": 0.999200770415388, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 23238 + }, + { + "epoch": 0.23239, + "grad_norm": 0.9900950045281023, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 23239 + }, + { + "epoch": 0.2324, + "grad_norm": 1.0001850110999082, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 23240 + }, + { + "epoch": 0.23241, + "grad_norm": 0.9767799123132489, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 23241 + }, + { + "epoch": 0.23242, + "grad_norm": 1.0858482824867202, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 23242 + }, + { + "epoch": 0.23243, + "grad_norm": 1.0408044330079458, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 23243 + }, + { + "epoch": 0.23244, + "grad_norm": 0.9928541307010126, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 23244 + }, + { + "epoch": 0.23245, + "grad_norm": 0.8675087983831122, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 23245 + }, + { + "epoch": 0.23246, + "grad_norm": 0.8981112837908503, + "learning_rate": 0.003, + "loss": 4.077, + "step": 23246 + }, + { + "epoch": 0.23247, + "grad_norm": 0.9328525917233651, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 23247 + }, + { + "epoch": 0.23248, + "grad_norm": 0.9756171868427217, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 23248 + }, + { + "epoch": 0.23249, + "grad_norm": 1.0950239360776952, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 23249 + }, + { + "epoch": 0.2325, + "grad_norm": 1.0783311333497028, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 23250 + }, + { + "epoch": 0.23251, + "grad_norm": 1.1335158849567175, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 23251 + }, + { + "epoch": 0.23252, + "grad_norm": 0.8492377915473657, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 23252 + }, + { + "epoch": 0.23253, + "grad_norm": 0.8559040391897444, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 23253 + }, + { + "epoch": 0.23254, + "grad_norm": 0.8212510012899998, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 23254 + }, + { + "epoch": 0.23255, + "grad_norm": 0.8349638108267292, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 23255 + }, + { + "epoch": 0.23256, + "grad_norm": 0.8669598562870442, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 23256 + }, + { + "epoch": 0.23257, + "grad_norm": 1.2031135123573853, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 23257 + }, + { + "epoch": 0.23258, + "grad_norm": 1.132960801067764, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 23258 + }, + { + "epoch": 0.23259, + "grad_norm": 0.9274153111540889, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 23259 + }, + { + "epoch": 0.2326, + "grad_norm": 0.8392512702203906, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 23260 + }, + { + "epoch": 0.23261, + "grad_norm": 0.8153444941138165, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 23261 + }, + { + "epoch": 0.23262, + "grad_norm": 0.7650530271338066, + "learning_rate": 0.003, + "loss": 4.026, + "step": 23262 + }, + { + "epoch": 0.23263, + "grad_norm": 0.7072048818401927, + "learning_rate": 0.003, + "loss": 4.051, + "step": 23263 + }, + { + "epoch": 0.23264, + "grad_norm": 0.7434278611886409, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 23264 + }, + { + "epoch": 0.23265, + "grad_norm": 0.6969863964264411, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 23265 + }, + { + "epoch": 0.23266, + "grad_norm": 0.5921543446129753, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 23266 + }, + { + "epoch": 0.23267, + "grad_norm": 0.6246915459886806, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 23267 + }, + { + "epoch": 0.23268, + "grad_norm": 0.6071232070964971, + "learning_rate": 0.003, + "loss": 4.034, + "step": 23268 + }, + { + "epoch": 0.23269, + "grad_norm": 0.6120017112848584, + "learning_rate": 0.003, + "loss": 4.047, + "step": 23269 + }, + { + "epoch": 0.2327, + "grad_norm": 0.7381727894488059, + "learning_rate": 0.003, + "loss": 4.049, + "step": 23270 + }, + { + "epoch": 0.23271, + "grad_norm": 0.9377226044305099, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 23271 + }, + { + "epoch": 0.23272, + "grad_norm": 0.9610306381192282, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 23272 + }, + { + "epoch": 0.23273, + "grad_norm": 0.9320320224352803, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 23273 + }, + { + "epoch": 0.23274, + "grad_norm": 0.8911826352988351, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 23274 + }, + { + "epoch": 0.23275, + "grad_norm": 0.8105825721007471, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 23275 + }, + { + "epoch": 0.23276, + "grad_norm": 0.8670830025954498, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 23276 + }, + { + "epoch": 0.23277, + "grad_norm": 0.8657927094012594, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 23277 + }, + { + "epoch": 0.23278, + "grad_norm": 0.7708245466487271, + "learning_rate": 0.003, + "loss": 4.046, + "step": 23278 + }, + { + "epoch": 0.23279, + "grad_norm": 0.7787466473254215, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 23279 + }, + { + "epoch": 0.2328, + "grad_norm": 0.8164552198515813, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 23280 + }, + { + "epoch": 0.23281, + "grad_norm": 0.8927138473434831, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 23281 + }, + { + "epoch": 0.23282, + "grad_norm": 0.8876102475955792, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 23282 + }, + { + "epoch": 0.23283, + "grad_norm": 0.9087671931379016, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 23283 + }, + { + "epoch": 0.23284, + "grad_norm": 0.9469167290019652, + "learning_rate": 0.003, + "loss": 4.054, + "step": 23284 + }, + { + "epoch": 0.23285, + "grad_norm": 1.101195157510704, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 23285 + }, + { + "epoch": 0.23286, + "grad_norm": 1.1383771773928801, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 23286 + }, + { + "epoch": 0.23287, + "grad_norm": 1.1346771594819132, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 23287 + }, + { + "epoch": 0.23288, + "grad_norm": 0.9967749570008625, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 23288 + }, + { + "epoch": 0.23289, + "grad_norm": 0.9779788333655552, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 23289 + }, + { + "epoch": 0.2329, + "grad_norm": 0.96736155236567, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 23290 + }, + { + "epoch": 0.23291, + "grad_norm": 0.9867814017940819, + "learning_rate": 0.003, + "loss": 4.066, + "step": 23291 + }, + { + "epoch": 0.23292, + "grad_norm": 1.0294637262664235, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 23292 + }, + { + "epoch": 0.23293, + "grad_norm": 0.7994950119197537, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 23293 + }, + { + "epoch": 0.23294, + "grad_norm": 0.7481805754458921, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 23294 + }, + { + "epoch": 0.23295, + "grad_norm": 0.7650244135894871, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 23295 + }, + { + "epoch": 0.23296, + "grad_norm": 1.013495268242076, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 23296 + }, + { + "epoch": 0.23297, + "grad_norm": 1.164764749176224, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 23297 + }, + { + "epoch": 0.23298, + "grad_norm": 0.8960344235012595, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 23298 + }, + { + "epoch": 0.23299, + "grad_norm": 0.8881564501743127, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 23299 + }, + { + "epoch": 0.233, + "grad_norm": 0.9350007006593121, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 23300 + }, + { + "epoch": 0.23301, + "grad_norm": 0.9548358908147525, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 23301 + }, + { + "epoch": 0.23302, + "grad_norm": 1.0334268930845638, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 23302 + }, + { + "epoch": 0.23303, + "grad_norm": 1.0357680667654454, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 23303 + }, + { + "epoch": 0.23304, + "grad_norm": 1.0529152874427279, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 23304 + }, + { + "epoch": 0.23305, + "grad_norm": 0.9156996550201966, + "learning_rate": 0.003, + "loss": 4.064, + "step": 23305 + }, + { + "epoch": 0.23306, + "grad_norm": 0.8940837746911591, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 23306 + }, + { + "epoch": 0.23307, + "grad_norm": 0.9965133688987783, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 23307 + }, + { + "epoch": 0.23308, + "grad_norm": 0.8363566760855301, + "learning_rate": 0.003, + "loss": 4.035, + "step": 23308 + }, + { + "epoch": 0.23309, + "grad_norm": 0.7406233933871841, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 23309 + }, + { + "epoch": 0.2331, + "grad_norm": 0.7408562529044742, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 23310 + }, + { + "epoch": 0.23311, + "grad_norm": 0.713263037142467, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 23311 + }, + { + "epoch": 0.23312, + "grad_norm": 0.7928308735562886, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 23312 + }, + { + "epoch": 0.23313, + "grad_norm": 0.871717260427556, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 23313 + }, + { + "epoch": 0.23314, + "grad_norm": 1.0558291387356178, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 23314 + }, + { + "epoch": 0.23315, + "grad_norm": 0.944503677573255, + "learning_rate": 0.003, + "loss": 4.036, + "step": 23315 + }, + { + "epoch": 0.23316, + "grad_norm": 0.8115335344824361, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 23316 + }, + { + "epoch": 0.23317, + "grad_norm": 0.7557550645339812, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 23317 + }, + { + "epoch": 0.23318, + "grad_norm": 0.8046906030587745, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 23318 + }, + { + "epoch": 0.23319, + "grad_norm": 0.9800549503027157, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 23319 + }, + { + "epoch": 0.2332, + "grad_norm": 0.9080598060586338, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 23320 + }, + { + "epoch": 0.23321, + "grad_norm": 0.847657931039378, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 23321 + }, + { + "epoch": 0.23322, + "grad_norm": 0.820622327753731, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 23322 + }, + { + "epoch": 0.23323, + "grad_norm": 0.8095692117922965, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 23323 + }, + { + "epoch": 0.23324, + "grad_norm": 0.826232442867, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 23324 + }, + { + "epoch": 0.23325, + "grad_norm": 0.7780695456310369, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 23325 + }, + { + "epoch": 0.23326, + "grad_norm": 0.7433992194672006, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 23326 + }, + { + "epoch": 0.23327, + "grad_norm": 0.7962462661564261, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 23327 + }, + { + "epoch": 0.23328, + "grad_norm": 0.9421042256332244, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 23328 + }, + { + "epoch": 0.23329, + "grad_norm": 1.1195373580440857, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 23329 + }, + { + "epoch": 0.2333, + "grad_norm": 1.0369531212452263, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 23330 + }, + { + "epoch": 0.23331, + "grad_norm": 1.0902708414049547, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 23331 + }, + { + "epoch": 0.23332, + "grad_norm": 0.9376568194493495, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 23332 + }, + { + "epoch": 0.23333, + "grad_norm": 0.8358816175056863, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 23333 + }, + { + "epoch": 0.23334, + "grad_norm": 0.8770463459751547, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 23334 + }, + { + "epoch": 0.23335, + "grad_norm": 0.9084869034696209, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 23335 + }, + { + "epoch": 0.23336, + "grad_norm": 0.9164647954125518, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 23336 + }, + { + "epoch": 0.23337, + "grad_norm": 0.9468103515429476, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 23337 + }, + { + "epoch": 0.23338, + "grad_norm": 0.9603457233002876, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 23338 + }, + { + "epoch": 0.23339, + "grad_norm": 0.8631892790024719, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 23339 + }, + { + "epoch": 0.2334, + "grad_norm": 0.8641952703679369, + "learning_rate": 0.003, + "loss": 4.069, + "step": 23340 + }, + { + "epoch": 0.23341, + "grad_norm": 0.8706020331767145, + "learning_rate": 0.003, + "loss": 4.059, + "step": 23341 + }, + { + "epoch": 0.23342, + "grad_norm": 1.013057060348118, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 23342 + }, + { + "epoch": 0.23343, + "grad_norm": 1.1822166227602056, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 23343 + }, + { + "epoch": 0.23344, + "grad_norm": 0.9738003062187575, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 23344 + }, + { + "epoch": 0.23345, + "grad_norm": 0.8503096136492654, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 23345 + }, + { + "epoch": 0.23346, + "grad_norm": 0.6910227370125558, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 23346 + }, + { + "epoch": 0.23347, + "grad_norm": 0.7265346072820527, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 23347 + }, + { + "epoch": 0.23348, + "grad_norm": 0.8112023118736004, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 23348 + }, + { + "epoch": 0.23349, + "grad_norm": 1.0429340872049848, + "learning_rate": 0.003, + "loss": 4.014, + "step": 23349 + }, + { + "epoch": 0.2335, + "grad_norm": 1.110429019074852, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 23350 + }, + { + "epoch": 0.23351, + "grad_norm": 1.0566018295048674, + "learning_rate": 0.003, + "loss": 4.029, + "step": 23351 + }, + { + "epoch": 0.23352, + "grad_norm": 1.00512196214799, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 23352 + }, + { + "epoch": 0.23353, + "grad_norm": 0.8489244570738428, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 23353 + }, + { + "epoch": 0.23354, + "grad_norm": 0.7565669660831456, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 23354 + }, + { + "epoch": 0.23355, + "grad_norm": 0.9221691464711177, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 23355 + }, + { + "epoch": 0.23356, + "grad_norm": 1.085255935128597, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 23356 + }, + { + "epoch": 0.23357, + "grad_norm": 0.9103587875579616, + "learning_rate": 0.003, + "loss": 4.074, + "step": 23357 + }, + { + "epoch": 0.23358, + "grad_norm": 0.8512063629589692, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 23358 + }, + { + "epoch": 0.23359, + "grad_norm": 0.7470681476112401, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 23359 + }, + { + "epoch": 0.2336, + "grad_norm": 0.5961140625364454, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 23360 + }, + { + "epoch": 0.23361, + "grad_norm": 0.5798801546356659, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 23361 + }, + { + "epoch": 0.23362, + "grad_norm": 0.5902421624891833, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 23362 + }, + { + "epoch": 0.23363, + "grad_norm": 0.6999184120311019, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 23363 + }, + { + "epoch": 0.23364, + "grad_norm": 0.8893670383757734, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 23364 + }, + { + "epoch": 0.23365, + "grad_norm": 1.219470143245809, + "learning_rate": 0.003, + "loss": 4.049, + "step": 23365 + }, + { + "epoch": 0.23366, + "grad_norm": 0.8412023347162878, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 23366 + }, + { + "epoch": 0.23367, + "grad_norm": 0.7462332736457706, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 23367 + }, + { + "epoch": 0.23368, + "grad_norm": 0.7395531664941113, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 23368 + }, + { + "epoch": 0.23369, + "grad_norm": 0.9318765351397753, + "learning_rate": 0.003, + "loss": 4.017, + "step": 23369 + }, + { + "epoch": 0.2337, + "grad_norm": 1.0284846386411144, + "learning_rate": 0.003, + "loss": 4.039, + "step": 23370 + }, + { + "epoch": 0.23371, + "grad_norm": 1.082375674841975, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 23371 + }, + { + "epoch": 0.23372, + "grad_norm": 0.8681449169859182, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 23372 + }, + { + "epoch": 0.23373, + "grad_norm": 0.7643782292642143, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 23373 + }, + { + "epoch": 0.23374, + "grad_norm": 0.7886293105726657, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 23374 + }, + { + "epoch": 0.23375, + "grad_norm": 0.8184168061138848, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 23375 + }, + { + "epoch": 0.23376, + "grad_norm": 0.7990018029882848, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 23376 + }, + { + "epoch": 0.23377, + "grad_norm": 0.748870729955507, + "learning_rate": 0.003, + "loss": 4.053, + "step": 23377 + }, + { + "epoch": 0.23378, + "grad_norm": 0.9132468595526786, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 23378 + }, + { + "epoch": 0.23379, + "grad_norm": 1.0098402223700262, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 23379 + }, + { + "epoch": 0.2338, + "grad_norm": 1.1381184887449074, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 23380 + }, + { + "epoch": 0.23381, + "grad_norm": 0.9805527873250193, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 23381 + }, + { + "epoch": 0.23382, + "grad_norm": 1.000360382070932, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 23382 + }, + { + "epoch": 0.23383, + "grad_norm": 0.9980224432070521, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 23383 + }, + { + "epoch": 0.23384, + "grad_norm": 1.1132622676282136, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 23384 + }, + { + "epoch": 0.23385, + "grad_norm": 0.9143593082991279, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 23385 + }, + { + "epoch": 0.23386, + "grad_norm": 0.8361349588005175, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 23386 + }, + { + "epoch": 0.23387, + "grad_norm": 0.8341700939575358, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 23387 + }, + { + "epoch": 0.23388, + "grad_norm": 0.8544316720492576, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 23388 + }, + { + "epoch": 0.23389, + "grad_norm": 0.8194641054665752, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 23389 + }, + { + "epoch": 0.2339, + "grad_norm": 0.7695433452794722, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 23390 + }, + { + "epoch": 0.23391, + "grad_norm": 0.8791339635741777, + "learning_rate": 0.003, + "loss": 4.063, + "step": 23391 + }, + { + "epoch": 0.23392, + "grad_norm": 0.9674610737913872, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 23392 + }, + { + "epoch": 0.23393, + "grad_norm": 1.0394141583490182, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 23393 + }, + { + "epoch": 0.23394, + "grad_norm": 1.1078255857520207, + "learning_rate": 0.003, + "loss": 4.06, + "step": 23394 + }, + { + "epoch": 0.23395, + "grad_norm": 1.023291222036471, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 23395 + }, + { + "epoch": 0.23396, + "grad_norm": 0.8847425875234746, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 23396 + }, + { + "epoch": 0.23397, + "grad_norm": 0.9994241022835657, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 23397 + }, + { + "epoch": 0.23398, + "grad_norm": 1.066314590277396, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 23398 + }, + { + "epoch": 0.23399, + "grad_norm": 0.9337551056905361, + "learning_rate": 0.003, + "loss": 4.088, + "step": 23399 + }, + { + "epoch": 0.234, + "grad_norm": 0.9919661609265504, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 23400 + }, + { + "epoch": 0.23401, + "grad_norm": 0.9930058711932699, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 23401 + }, + { + "epoch": 0.23402, + "grad_norm": 0.946035310376112, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 23402 + }, + { + "epoch": 0.23403, + "grad_norm": 0.9479158532294484, + "learning_rate": 0.003, + "loss": 4.057, + "step": 23403 + }, + { + "epoch": 0.23404, + "grad_norm": 0.9887494895668005, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 23404 + }, + { + "epoch": 0.23405, + "grad_norm": 1.0795738494837765, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 23405 + }, + { + "epoch": 0.23406, + "grad_norm": 0.9748867888107192, + "learning_rate": 0.003, + "loss": 4.063, + "step": 23406 + }, + { + "epoch": 0.23407, + "grad_norm": 0.9747261742240523, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 23407 + }, + { + "epoch": 0.23408, + "grad_norm": 0.9439911353112621, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 23408 + }, + { + "epoch": 0.23409, + "grad_norm": 0.9858693723609644, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 23409 + }, + { + "epoch": 0.2341, + "grad_norm": 1.0619208062979824, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 23410 + }, + { + "epoch": 0.23411, + "grad_norm": 0.9572973427946112, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 23411 + }, + { + "epoch": 0.23412, + "grad_norm": 0.9011634764609106, + "learning_rate": 0.003, + "loss": 4.052, + "step": 23412 + }, + { + "epoch": 0.23413, + "grad_norm": 0.9263551959365492, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 23413 + }, + { + "epoch": 0.23414, + "grad_norm": 0.8628853629024509, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 23414 + }, + { + "epoch": 0.23415, + "grad_norm": 0.8275678625042461, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 23415 + }, + { + "epoch": 0.23416, + "grad_norm": 0.7840972622886222, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 23416 + }, + { + "epoch": 0.23417, + "grad_norm": 0.8182833085317417, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 23417 + }, + { + "epoch": 0.23418, + "grad_norm": 0.8769906378803388, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 23418 + }, + { + "epoch": 0.23419, + "grad_norm": 0.9113616275585473, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 23419 + }, + { + "epoch": 0.2342, + "grad_norm": 0.9554640268768768, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 23420 + }, + { + "epoch": 0.23421, + "grad_norm": 0.9315320557270598, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 23421 + }, + { + "epoch": 0.23422, + "grad_norm": 0.7702007732696645, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 23422 + }, + { + "epoch": 0.23423, + "grad_norm": 0.790431807343337, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 23423 + }, + { + "epoch": 0.23424, + "grad_norm": 0.7609546132057345, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 23424 + }, + { + "epoch": 0.23425, + "grad_norm": 0.6985684874736482, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 23425 + }, + { + "epoch": 0.23426, + "grad_norm": 0.6891898397494017, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 23426 + }, + { + "epoch": 0.23427, + "grad_norm": 0.7279093681307923, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 23427 + }, + { + "epoch": 0.23428, + "grad_norm": 0.8441303383782134, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 23428 + }, + { + "epoch": 0.23429, + "grad_norm": 1.04643159548747, + "learning_rate": 0.003, + "loss": 4.065, + "step": 23429 + }, + { + "epoch": 0.2343, + "grad_norm": 1.2918401876269727, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 23430 + }, + { + "epoch": 0.23431, + "grad_norm": 0.6711194521983928, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 23431 + }, + { + "epoch": 0.23432, + "grad_norm": 0.6874738068004878, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 23432 + }, + { + "epoch": 0.23433, + "grad_norm": 0.7826447289270644, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 23433 + }, + { + "epoch": 0.23434, + "grad_norm": 0.701971592804287, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 23434 + }, + { + "epoch": 0.23435, + "grad_norm": 0.6101785123639616, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 23435 + }, + { + "epoch": 0.23436, + "grad_norm": 0.5880009523193628, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 23436 + }, + { + "epoch": 0.23437, + "grad_norm": 0.6339926810602442, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 23437 + }, + { + "epoch": 0.23438, + "grad_norm": 0.6833875393171981, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 23438 + }, + { + "epoch": 0.23439, + "grad_norm": 0.7916025228216734, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 23439 + }, + { + "epoch": 0.2344, + "grad_norm": 0.8697467276483367, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 23440 + }, + { + "epoch": 0.23441, + "grad_norm": 1.100718052765827, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 23441 + }, + { + "epoch": 0.23442, + "grad_norm": 1.053195700373657, + "learning_rate": 0.003, + "loss": 4.077, + "step": 23442 + }, + { + "epoch": 0.23443, + "grad_norm": 0.8617135427224588, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 23443 + }, + { + "epoch": 0.23444, + "grad_norm": 0.7032869299540702, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 23444 + }, + { + "epoch": 0.23445, + "grad_norm": 0.6923348590258565, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 23445 + }, + { + "epoch": 0.23446, + "grad_norm": 0.7284603533865156, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 23446 + }, + { + "epoch": 0.23447, + "grad_norm": 0.8536187828216245, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 23447 + }, + { + "epoch": 0.23448, + "grad_norm": 0.995310467170776, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 23448 + }, + { + "epoch": 0.23449, + "grad_norm": 1.2160398849176544, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 23449 + }, + { + "epoch": 0.2345, + "grad_norm": 0.8882534098429892, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 23450 + }, + { + "epoch": 0.23451, + "grad_norm": 0.8615827600129583, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 23451 + }, + { + "epoch": 0.23452, + "grad_norm": 0.9464588359712807, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 23452 + }, + { + "epoch": 0.23453, + "grad_norm": 0.9605578950153069, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 23453 + }, + { + "epoch": 0.23454, + "grad_norm": 1.0168215253821362, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 23454 + }, + { + "epoch": 0.23455, + "grad_norm": 1.0290202474970156, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 23455 + }, + { + "epoch": 0.23456, + "grad_norm": 0.9892988617362561, + "learning_rate": 0.003, + "loss": 4.058, + "step": 23456 + }, + { + "epoch": 0.23457, + "grad_norm": 0.8197320854755072, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 23457 + }, + { + "epoch": 0.23458, + "grad_norm": 0.8537843438185317, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 23458 + }, + { + "epoch": 0.23459, + "grad_norm": 0.9417005757161352, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 23459 + }, + { + "epoch": 0.2346, + "grad_norm": 0.9969534769222956, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 23460 + }, + { + "epoch": 0.23461, + "grad_norm": 1.136723325063256, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 23461 + }, + { + "epoch": 0.23462, + "grad_norm": 1.045664861745029, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 23462 + }, + { + "epoch": 0.23463, + "grad_norm": 0.9670235398040552, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 23463 + }, + { + "epoch": 0.23464, + "grad_norm": 0.9860658741113812, + "learning_rate": 0.003, + "loss": 4.068, + "step": 23464 + }, + { + "epoch": 0.23465, + "grad_norm": 0.9978464502003478, + "learning_rate": 0.003, + "loss": 4.071, + "step": 23465 + }, + { + "epoch": 0.23466, + "grad_norm": 0.9673931812832036, + "learning_rate": 0.003, + "loss": 4.061, + "step": 23466 + }, + { + "epoch": 0.23467, + "grad_norm": 0.8699565165568466, + "learning_rate": 0.003, + "loss": 4.063, + "step": 23467 + }, + { + "epoch": 0.23468, + "grad_norm": 0.9065795198411963, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 23468 + }, + { + "epoch": 0.23469, + "grad_norm": 0.75466624961184, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 23469 + }, + { + "epoch": 0.2347, + "grad_norm": 0.6561214345777504, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 23470 + }, + { + "epoch": 0.23471, + "grad_norm": 0.6309503611918389, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 23471 + }, + { + "epoch": 0.23472, + "grad_norm": 0.743276881969359, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 23472 + }, + { + "epoch": 0.23473, + "grad_norm": 0.8782294109824252, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 23473 + }, + { + "epoch": 0.23474, + "grad_norm": 1.0074254170883652, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 23474 + }, + { + "epoch": 0.23475, + "grad_norm": 0.9881534497496233, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 23475 + }, + { + "epoch": 0.23476, + "grad_norm": 0.8269256727291021, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 23476 + }, + { + "epoch": 0.23477, + "grad_norm": 0.6738745656842985, + "learning_rate": 0.003, + "loss": 4.057, + "step": 23477 + }, + { + "epoch": 0.23478, + "grad_norm": 0.6188196020134842, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 23478 + }, + { + "epoch": 0.23479, + "grad_norm": 0.6708426760650309, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 23479 + }, + { + "epoch": 0.2348, + "grad_norm": 0.8527239421195897, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 23480 + }, + { + "epoch": 0.23481, + "grad_norm": 1.0777377799731296, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 23481 + }, + { + "epoch": 0.23482, + "grad_norm": 0.9077010630127782, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 23482 + }, + { + "epoch": 0.23483, + "grad_norm": 0.8096723144590248, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 23483 + }, + { + "epoch": 0.23484, + "grad_norm": 0.6454451594198872, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 23484 + }, + { + "epoch": 0.23485, + "grad_norm": 0.634693890899822, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 23485 + }, + { + "epoch": 0.23486, + "grad_norm": 0.7590901521337867, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 23486 + }, + { + "epoch": 0.23487, + "grad_norm": 0.8631093358471466, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 23487 + }, + { + "epoch": 0.23488, + "grad_norm": 0.8836530459599684, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 23488 + }, + { + "epoch": 0.23489, + "grad_norm": 0.8167953744867498, + "learning_rate": 0.003, + "loss": 4.065, + "step": 23489 + }, + { + "epoch": 0.2349, + "grad_norm": 0.7322969154946962, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 23490 + }, + { + "epoch": 0.23491, + "grad_norm": 0.7532723755276917, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 23491 + }, + { + "epoch": 0.23492, + "grad_norm": 0.9038800318880632, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 23492 + }, + { + "epoch": 0.23493, + "grad_norm": 0.9920619300549424, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 23493 + }, + { + "epoch": 0.23494, + "grad_norm": 1.0187695353063326, + "learning_rate": 0.003, + "loss": 4.065, + "step": 23494 + }, + { + "epoch": 0.23495, + "grad_norm": 1.1778584734092041, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 23495 + }, + { + "epoch": 0.23496, + "grad_norm": 0.9224805363428353, + "learning_rate": 0.003, + "loss": 4.051, + "step": 23496 + }, + { + "epoch": 0.23497, + "grad_norm": 0.9436997654433646, + "learning_rate": 0.003, + "loss": 4.03, + "step": 23497 + }, + { + "epoch": 0.23498, + "grad_norm": 0.9952993171817909, + "learning_rate": 0.003, + "loss": 4.036, + "step": 23498 + }, + { + "epoch": 0.23499, + "grad_norm": 0.995203368537637, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 23499 + }, + { + "epoch": 0.235, + "grad_norm": 1.1294019839979683, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 23500 + }, + { + "epoch": 0.23501, + "grad_norm": 0.9612324528905263, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 23501 + }, + { + "epoch": 0.23502, + "grad_norm": 0.9921975929743411, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 23502 + }, + { + "epoch": 0.23503, + "grad_norm": 1.0270290566550475, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 23503 + }, + { + "epoch": 0.23504, + "grad_norm": 1.036780613639376, + "learning_rate": 0.003, + "loss": 4.052, + "step": 23504 + }, + { + "epoch": 0.23505, + "grad_norm": 1.029123150239687, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 23505 + }, + { + "epoch": 0.23506, + "grad_norm": 0.946117669682993, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 23506 + }, + { + "epoch": 0.23507, + "grad_norm": 1.1144297464145339, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 23507 + }, + { + "epoch": 0.23508, + "grad_norm": 0.9799699580148787, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 23508 + }, + { + "epoch": 0.23509, + "grad_norm": 0.9093323460452115, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 23509 + }, + { + "epoch": 0.2351, + "grad_norm": 0.9409869663730628, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 23510 + }, + { + "epoch": 0.23511, + "grad_norm": 0.9794146986958985, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 23511 + }, + { + "epoch": 0.23512, + "grad_norm": 1.03550774538799, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 23512 + }, + { + "epoch": 0.23513, + "grad_norm": 0.8167910926584461, + "learning_rate": 0.003, + "loss": 4.058, + "step": 23513 + }, + { + "epoch": 0.23514, + "grad_norm": 0.7470325975213249, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 23514 + }, + { + "epoch": 0.23515, + "grad_norm": 0.7206279401531992, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 23515 + }, + { + "epoch": 0.23516, + "grad_norm": 0.7518204968408843, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 23516 + }, + { + "epoch": 0.23517, + "grad_norm": 0.7967049472635556, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 23517 + }, + { + "epoch": 0.23518, + "grad_norm": 0.9542519789857323, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 23518 + }, + { + "epoch": 0.23519, + "grad_norm": 1.027237973204826, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 23519 + }, + { + "epoch": 0.2352, + "grad_norm": 0.9077309554712739, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 23520 + }, + { + "epoch": 0.23521, + "grad_norm": 0.9353060411573507, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 23521 + }, + { + "epoch": 0.23522, + "grad_norm": 0.9169242440742275, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 23522 + }, + { + "epoch": 0.23523, + "grad_norm": 0.9197349234314814, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 23523 + }, + { + "epoch": 0.23524, + "grad_norm": 0.7478110805490918, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 23524 + }, + { + "epoch": 0.23525, + "grad_norm": 0.6723302418236689, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 23525 + }, + { + "epoch": 0.23526, + "grad_norm": 0.7146151303034055, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 23526 + }, + { + "epoch": 0.23527, + "grad_norm": 0.8442950772620854, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 23527 + }, + { + "epoch": 0.23528, + "grad_norm": 0.9629020451911949, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 23528 + }, + { + "epoch": 0.23529, + "grad_norm": 1.1076277992353016, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 23529 + }, + { + "epoch": 0.2353, + "grad_norm": 1.0262669318483837, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 23530 + }, + { + "epoch": 0.23531, + "grad_norm": 1.022044751703166, + "learning_rate": 0.003, + "loss": 4.0064, + "step": 23531 + }, + { + "epoch": 0.23532, + "grad_norm": 1.0333940674483422, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 23532 + }, + { + "epoch": 0.23533, + "grad_norm": 0.9994143479050802, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 23533 + }, + { + "epoch": 0.23534, + "grad_norm": 1.043355750470464, + "learning_rate": 0.003, + "loss": 4.082, + "step": 23534 + }, + { + "epoch": 0.23535, + "grad_norm": 0.8283532203989878, + "learning_rate": 0.003, + "loss": 4.066, + "step": 23535 + }, + { + "epoch": 0.23536, + "grad_norm": 0.7617125529874617, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 23536 + }, + { + "epoch": 0.23537, + "grad_norm": 0.9138806394157641, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 23537 + }, + { + "epoch": 0.23538, + "grad_norm": 0.903383173408642, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 23538 + }, + { + "epoch": 0.23539, + "grad_norm": 0.7883716512588659, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 23539 + }, + { + "epoch": 0.2354, + "grad_norm": 0.847778479790901, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 23540 + }, + { + "epoch": 0.23541, + "grad_norm": 0.8838225140860158, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 23541 + }, + { + "epoch": 0.23542, + "grad_norm": 0.9284506878331492, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 23542 + }, + { + "epoch": 0.23543, + "grad_norm": 0.8348687662052139, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 23543 + }, + { + "epoch": 0.23544, + "grad_norm": 0.8887999874833376, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 23544 + }, + { + "epoch": 0.23545, + "grad_norm": 0.9475586610532396, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 23545 + }, + { + "epoch": 0.23546, + "grad_norm": 0.7138223749279077, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 23546 + }, + { + "epoch": 0.23547, + "grad_norm": 0.5730568821844476, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 23547 + }, + { + "epoch": 0.23548, + "grad_norm": 0.6239287737464521, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 23548 + }, + { + "epoch": 0.23549, + "grad_norm": 0.7515950009395274, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 23549 + }, + { + "epoch": 0.2355, + "grad_norm": 1.0329720486858198, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 23550 + }, + { + "epoch": 0.23551, + "grad_norm": 1.209157868285168, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 23551 + }, + { + "epoch": 0.23552, + "grad_norm": 0.5837990966246265, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 23552 + }, + { + "epoch": 0.23553, + "grad_norm": 0.7274515567994064, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 23553 + }, + { + "epoch": 0.23554, + "grad_norm": 0.8619683797766035, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 23554 + }, + { + "epoch": 0.23555, + "grad_norm": 0.9586212709246147, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 23555 + }, + { + "epoch": 0.23556, + "grad_norm": 1.086459832909454, + "learning_rate": 0.003, + "loss": 4.024, + "step": 23556 + }, + { + "epoch": 0.23557, + "grad_norm": 0.8578811925102114, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 23557 + }, + { + "epoch": 0.23558, + "grad_norm": 0.761003404904438, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 23558 + }, + { + "epoch": 0.23559, + "grad_norm": 0.7650770056002918, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 23559 + }, + { + "epoch": 0.2356, + "grad_norm": 0.8010382266649182, + "learning_rate": 0.003, + "loss": 4.063, + "step": 23560 + }, + { + "epoch": 0.23561, + "grad_norm": 0.8819968910323468, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 23561 + }, + { + "epoch": 0.23562, + "grad_norm": 0.8370755739812418, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 23562 + }, + { + "epoch": 0.23563, + "grad_norm": 0.8851392694988899, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 23563 + }, + { + "epoch": 0.23564, + "grad_norm": 0.8623204350197202, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 23564 + }, + { + "epoch": 0.23565, + "grad_norm": 0.9262339707527562, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 23565 + }, + { + "epoch": 0.23566, + "grad_norm": 0.8875631855423897, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 23566 + }, + { + "epoch": 0.23567, + "grad_norm": 0.8618856193538932, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 23567 + }, + { + "epoch": 0.23568, + "grad_norm": 0.8412703707105913, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 23568 + }, + { + "epoch": 0.23569, + "grad_norm": 0.859622999156412, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 23569 + }, + { + "epoch": 0.2357, + "grad_norm": 0.8854793718788011, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 23570 + }, + { + "epoch": 0.23571, + "grad_norm": 0.9680686922638029, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 23571 + }, + { + "epoch": 0.23572, + "grad_norm": 0.956367222153997, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 23572 + }, + { + "epoch": 0.23573, + "grad_norm": 0.9875147630233635, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 23573 + }, + { + "epoch": 0.23574, + "grad_norm": 1.1374897107766333, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 23574 + }, + { + "epoch": 0.23575, + "grad_norm": 0.9361115171636641, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 23575 + }, + { + "epoch": 0.23576, + "grad_norm": 0.9646203889033469, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 23576 + }, + { + "epoch": 0.23577, + "grad_norm": 1.0001062256926352, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 23577 + }, + { + "epoch": 0.23578, + "grad_norm": 1.0797203602522731, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 23578 + }, + { + "epoch": 0.23579, + "grad_norm": 1.1802473448698165, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 23579 + }, + { + "epoch": 0.2358, + "grad_norm": 1.026226737154868, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 23580 + }, + { + "epoch": 0.23581, + "grad_norm": 0.8832815362627126, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 23581 + }, + { + "epoch": 0.23582, + "grad_norm": 0.7048146501859645, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 23582 + }, + { + "epoch": 0.23583, + "grad_norm": 0.6464982389973704, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 23583 + }, + { + "epoch": 0.23584, + "grad_norm": 0.6526469346115965, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 23584 + }, + { + "epoch": 0.23585, + "grad_norm": 0.6546173838286163, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 23585 + }, + { + "epoch": 0.23586, + "grad_norm": 0.6441171924240503, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 23586 + }, + { + "epoch": 0.23587, + "grad_norm": 0.6523256404459764, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 23587 + }, + { + "epoch": 0.23588, + "grad_norm": 0.6399602779107045, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 23588 + }, + { + "epoch": 0.23589, + "grad_norm": 0.6183238921054076, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 23589 + }, + { + "epoch": 0.2359, + "grad_norm": 0.635493406806837, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 23590 + }, + { + "epoch": 0.23591, + "grad_norm": 0.8146691276437147, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 23591 + }, + { + "epoch": 0.23592, + "grad_norm": 1.0951470006089996, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 23592 + }, + { + "epoch": 0.23593, + "grad_norm": 1.1583929813081055, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 23593 + }, + { + "epoch": 0.23594, + "grad_norm": 0.8712449056385658, + "learning_rate": 0.003, + "loss": 4.063, + "step": 23594 + }, + { + "epoch": 0.23595, + "grad_norm": 0.806471354872271, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 23595 + }, + { + "epoch": 0.23596, + "grad_norm": 0.7269455268062749, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 23596 + }, + { + "epoch": 0.23597, + "grad_norm": 0.7183578580805932, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 23597 + }, + { + "epoch": 0.23598, + "grad_norm": 0.7249367504658509, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 23598 + }, + { + "epoch": 0.23599, + "grad_norm": 0.8425052152058388, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 23599 + }, + { + "epoch": 0.236, + "grad_norm": 0.9742208637680512, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 23600 + }, + { + "epoch": 0.23601, + "grad_norm": 1.1461598131534463, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 23601 + }, + { + "epoch": 0.23602, + "grad_norm": 0.7875740516544314, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 23602 + }, + { + "epoch": 0.23603, + "grad_norm": 0.7230962926061553, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 23603 + }, + { + "epoch": 0.23604, + "grad_norm": 0.8479527183462586, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 23604 + }, + { + "epoch": 0.23605, + "grad_norm": 0.9041809157482524, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 23605 + }, + { + "epoch": 0.23606, + "grad_norm": 0.8391258589145818, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 23606 + }, + { + "epoch": 0.23607, + "grad_norm": 0.845458514490645, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 23607 + }, + { + "epoch": 0.23608, + "grad_norm": 0.9064973898408545, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 23608 + }, + { + "epoch": 0.23609, + "grad_norm": 0.9555197729355157, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 23609 + }, + { + "epoch": 0.2361, + "grad_norm": 0.9065372952649496, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 23610 + }, + { + "epoch": 0.23611, + "grad_norm": 0.9496673211055551, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 23611 + }, + { + "epoch": 0.23612, + "grad_norm": 1.1277771237357277, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 23612 + }, + { + "epoch": 0.23613, + "grad_norm": 1.2855285262865856, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 23613 + }, + { + "epoch": 0.23614, + "grad_norm": 1.0017365393907138, + "learning_rate": 0.003, + "loss": 4.033, + "step": 23614 + }, + { + "epoch": 0.23615, + "grad_norm": 0.9492447064677598, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 23615 + }, + { + "epoch": 0.23616, + "grad_norm": 1.0172177554368869, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 23616 + }, + { + "epoch": 0.23617, + "grad_norm": 1.0046308501484664, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 23617 + }, + { + "epoch": 0.23618, + "grad_norm": 0.8542539972453103, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 23618 + }, + { + "epoch": 0.23619, + "grad_norm": 0.7373111297814003, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 23619 + }, + { + "epoch": 0.2362, + "grad_norm": 0.8144671964369854, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 23620 + }, + { + "epoch": 0.23621, + "grad_norm": 0.9043474154302669, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 23621 + }, + { + "epoch": 0.23622, + "grad_norm": 0.9132045013295168, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 23622 + }, + { + "epoch": 0.23623, + "grad_norm": 0.9905635131882818, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 23623 + }, + { + "epoch": 0.23624, + "grad_norm": 1.094439400523337, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 23624 + }, + { + "epoch": 0.23625, + "grad_norm": 1.0767903637947664, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 23625 + }, + { + "epoch": 0.23626, + "grad_norm": 1.0162965675462243, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 23626 + }, + { + "epoch": 0.23627, + "grad_norm": 0.9850749670549553, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 23627 + }, + { + "epoch": 0.23628, + "grad_norm": 1.0212284158426461, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 23628 + }, + { + "epoch": 0.23629, + "grad_norm": 1.121964602804996, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 23629 + }, + { + "epoch": 0.2363, + "grad_norm": 0.9153124617988608, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 23630 + }, + { + "epoch": 0.23631, + "grad_norm": 0.8676822869985553, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 23631 + }, + { + "epoch": 0.23632, + "grad_norm": 0.7485787363934675, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 23632 + }, + { + "epoch": 0.23633, + "grad_norm": 0.7619567971358175, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 23633 + }, + { + "epoch": 0.23634, + "grad_norm": 0.8650858385414405, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 23634 + }, + { + "epoch": 0.23635, + "grad_norm": 0.9308350165470094, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 23635 + }, + { + "epoch": 0.23636, + "grad_norm": 1.228532982389462, + "learning_rate": 0.003, + "loss": 4.062, + "step": 23636 + }, + { + "epoch": 0.23637, + "grad_norm": 0.935256116006693, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 23637 + }, + { + "epoch": 0.23638, + "grad_norm": 0.8507425405458384, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 23638 + }, + { + "epoch": 0.23639, + "grad_norm": 0.8566502586079002, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 23639 + }, + { + "epoch": 0.2364, + "grad_norm": 0.7828140024110716, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 23640 + }, + { + "epoch": 0.23641, + "grad_norm": 0.7807380867926977, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 23641 + }, + { + "epoch": 0.23642, + "grad_norm": 0.7526310919570613, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 23642 + }, + { + "epoch": 0.23643, + "grad_norm": 0.7501699914736486, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 23643 + }, + { + "epoch": 0.23644, + "grad_norm": 0.9296431810122299, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 23644 + }, + { + "epoch": 0.23645, + "grad_norm": 0.8993955311518375, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 23645 + }, + { + "epoch": 0.23646, + "grad_norm": 0.8225115630681428, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 23646 + }, + { + "epoch": 0.23647, + "grad_norm": 0.9489010814155697, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 23647 + }, + { + "epoch": 0.23648, + "grad_norm": 1.0278446557178285, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 23648 + }, + { + "epoch": 0.23649, + "grad_norm": 0.8508752314092654, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 23649 + }, + { + "epoch": 0.2365, + "grad_norm": 0.7393204440650136, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 23650 + }, + { + "epoch": 0.23651, + "grad_norm": 0.7282365567618183, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 23651 + }, + { + "epoch": 0.23652, + "grad_norm": 0.7951331786723594, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 23652 + }, + { + "epoch": 0.23653, + "grad_norm": 0.8505496353603379, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 23653 + }, + { + "epoch": 0.23654, + "grad_norm": 0.9999252666650351, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 23654 + }, + { + "epoch": 0.23655, + "grad_norm": 1.1036654308243536, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 23655 + }, + { + "epoch": 0.23656, + "grad_norm": 0.8857862620759969, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 23656 + }, + { + "epoch": 0.23657, + "grad_norm": 1.0349756791578346, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 23657 + }, + { + "epoch": 0.23658, + "grad_norm": 0.8515051639875577, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 23658 + }, + { + "epoch": 0.23659, + "grad_norm": 0.7045717320814407, + "learning_rate": 0.003, + "loss": 4.041, + "step": 23659 + }, + { + "epoch": 0.2366, + "grad_norm": 0.6134568239393103, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 23660 + }, + { + "epoch": 0.23661, + "grad_norm": 0.7094646545527582, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 23661 + }, + { + "epoch": 0.23662, + "grad_norm": 0.8125165557695597, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 23662 + }, + { + "epoch": 0.23663, + "grad_norm": 0.9848910656251846, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 23663 + }, + { + "epoch": 0.23664, + "grad_norm": 1.0148514717454828, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 23664 + }, + { + "epoch": 0.23665, + "grad_norm": 1.0091959955395162, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 23665 + }, + { + "epoch": 0.23666, + "grad_norm": 0.8757007107412164, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 23666 + }, + { + "epoch": 0.23667, + "grad_norm": 0.8883704392602975, + "learning_rate": 0.003, + "loss": 4.055, + "step": 23667 + }, + { + "epoch": 0.23668, + "grad_norm": 1.0260382100750878, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 23668 + }, + { + "epoch": 0.23669, + "grad_norm": 1.044053788050421, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 23669 + }, + { + "epoch": 0.2367, + "grad_norm": 1.0829679192764872, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 23670 + }, + { + "epoch": 0.23671, + "grad_norm": 1.0582751893335969, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 23671 + }, + { + "epoch": 0.23672, + "grad_norm": 0.9141838014885043, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 23672 + }, + { + "epoch": 0.23673, + "grad_norm": 0.9379376997659608, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 23673 + }, + { + "epoch": 0.23674, + "grad_norm": 1.1124835008836031, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 23674 + }, + { + "epoch": 0.23675, + "grad_norm": 0.9916248203041355, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 23675 + }, + { + "epoch": 0.23676, + "grad_norm": 0.864929601268841, + "learning_rate": 0.003, + "loss": 4.084, + "step": 23676 + }, + { + "epoch": 0.23677, + "grad_norm": 0.8286947397229656, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 23677 + }, + { + "epoch": 0.23678, + "grad_norm": 0.7772779771133362, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 23678 + }, + { + "epoch": 0.23679, + "grad_norm": 0.659533718327479, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 23679 + }, + { + "epoch": 0.2368, + "grad_norm": 0.7120360805479395, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 23680 + }, + { + "epoch": 0.23681, + "grad_norm": 0.6575060339444038, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 23681 + }, + { + "epoch": 0.23682, + "grad_norm": 0.7034512232485463, + "learning_rate": 0.003, + "loss": 4.017, + "step": 23682 + }, + { + "epoch": 0.23683, + "grad_norm": 0.7854979227953794, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 23683 + }, + { + "epoch": 0.23684, + "grad_norm": 0.8640903386646525, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 23684 + }, + { + "epoch": 0.23685, + "grad_norm": 1.0013649370730158, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 23685 + }, + { + "epoch": 0.23686, + "grad_norm": 1.096318757466863, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 23686 + }, + { + "epoch": 0.23687, + "grad_norm": 0.8214280798521816, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 23687 + }, + { + "epoch": 0.23688, + "grad_norm": 0.7965584445237289, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 23688 + }, + { + "epoch": 0.23689, + "grad_norm": 0.7685865849769362, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 23689 + }, + { + "epoch": 0.2369, + "grad_norm": 0.6893280561013264, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 23690 + }, + { + "epoch": 0.23691, + "grad_norm": 0.6912213761040542, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 23691 + }, + { + "epoch": 0.23692, + "grad_norm": 0.7372370189926273, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 23692 + }, + { + "epoch": 0.23693, + "grad_norm": 0.8204260436094726, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 23693 + }, + { + "epoch": 0.23694, + "grad_norm": 0.9485968847610363, + "learning_rate": 0.003, + "loss": 4.013, + "step": 23694 + }, + { + "epoch": 0.23695, + "grad_norm": 1.1977787093846215, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 23695 + }, + { + "epoch": 0.23696, + "grad_norm": 1.0331212679192867, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 23696 + }, + { + "epoch": 0.23697, + "grad_norm": 0.886799126891739, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 23697 + }, + { + "epoch": 0.23698, + "grad_norm": 0.7945548226833384, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 23698 + }, + { + "epoch": 0.23699, + "grad_norm": 0.8262667902295605, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 23699 + }, + { + "epoch": 0.237, + "grad_norm": 0.7952850981510156, + "learning_rate": 0.003, + "loss": 4.028, + "step": 23700 + }, + { + "epoch": 0.23701, + "grad_norm": 0.9426131438109266, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 23701 + }, + { + "epoch": 0.23702, + "grad_norm": 1.2186326294816368, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 23702 + }, + { + "epoch": 0.23703, + "grad_norm": 0.9093822433317033, + "learning_rate": 0.003, + "loss": 4.042, + "step": 23703 + }, + { + "epoch": 0.23704, + "grad_norm": 0.924778055129946, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 23704 + }, + { + "epoch": 0.23705, + "grad_norm": 0.9605397122456875, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 23705 + }, + { + "epoch": 0.23706, + "grad_norm": 0.9281521026179532, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 23706 + }, + { + "epoch": 0.23707, + "grad_norm": 0.918885559322832, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 23707 + }, + { + "epoch": 0.23708, + "grad_norm": 0.9259313446023939, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 23708 + }, + { + "epoch": 0.23709, + "grad_norm": 0.8491193958148544, + "learning_rate": 0.003, + "loss": 4.062, + "step": 23709 + }, + { + "epoch": 0.2371, + "grad_norm": 0.9787414510707803, + "learning_rate": 0.003, + "loss": 4.042, + "step": 23710 + }, + { + "epoch": 0.23711, + "grad_norm": 1.0512576427281033, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 23711 + }, + { + "epoch": 0.23712, + "grad_norm": 1.1442441444524603, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 23712 + }, + { + "epoch": 0.23713, + "grad_norm": 0.7886432661186354, + "learning_rate": 0.003, + "loss": 4.046, + "step": 23713 + }, + { + "epoch": 0.23714, + "grad_norm": 0.7192607059020641, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 23714 + }, + { + "epoch": 0.23715, + "grad_norm": 0.6803307217059974, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 23715 + }, + { + "epoch": 0.23716, + "grad_norm": 0.7451331889798255, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 23716 + }, + { + "epoch": 0.23717, + "grad_norm": 0.9912150629909722, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 23717 + }, + { + "epoch": 0.23718, + "grad_norm": 1.142803181117511, + "learning_rate": 0.003, + "loss": 4.075, + "step": 23718 + }, + { + "epoch": 0.23719, + "grad_norm": 0.9536266113914136, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 23719 + }, + { + "epoch": 0.2372, + "grad_norm": 0.8318571044524352, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 23720 + }, + { + "epoch": 0.23721, + "grad_norm": 0.7369039646820076, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 23721 + }, + { + "epoch": 0.23722, + "grad_norm": 0.7186204991531209, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 23722 + }, + { + "epoch": 0.23723, + "grad_norm": 0.7303515590228324, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 23723 + }, + { + "epoch": 0.23724, + "grad_norm": 0.6972734523106194, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 23724 + }, + { + "epoch": 0.23725, + "grad_norm": 0.7479317547096554, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 23725 + }, + { + "epoch": 0.23726, + "grad_norm": 0.78785194481346, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 23726 + }, + { + "epoch": 0.23727, + "grad_norm": 0.9262905918361009, + "learning_rate": 0.003, + "loss": 4.048, + "step": 23727 + }, + { + "epoch": 0.23728, + "grad_norm": 1.183995877661308, + "learning_rate": 0.003, + "loss": 4.0043, + "step": 23728 + }, + { + "epoch": 0.23729, + "grad_norm": 0.9529623987101564, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 23729 + }, + { + "epoch": 0.2373, + "grad_norm": 0.8408421205102627, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 23730 + }, + { + "epoch": 0.23731, + "grad_norm": 0.9411302961024143, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 23731 + }, + { + "epoch": 0.23732, + "grad_norm": 1.1415803404196334, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 23732 + }, + { + "epoch": 0.23733, + "grad_norm": 0.9730155583799385, + "learning_rate": 0.003, + "loss": 4.073, + "step": 23733 + }, + { + "epoch": 0.23734, + "grad_norm": 0.9102850065374251, + "learning_rate": 0.003, + "loss": 4.046, + "step": 23734 + }, + { + "epoch": 0.23735, + "grad_norm": 1.1059069322995851, + "learning_rate": 0.003, + "loss": 4.057, + "step": 23735 + }, + { + "epoch": 0.23736, + "grad_norm": 1.0167701587712585, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 23736 + }, + { + "epoch": 0.23737, + "grad_norm": 0.9028611539175332, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 23737 + }, + { + "epoch": 0.23738, + "grad_norm": 0.8216560376710537, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 23738 + }, + { + "epoch": 0.23739, + "grad_norm": 0.8230191641359648, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 23739 + }, + { + "epoch": 0.2374, + "grad_norm": 0.9676879124111347, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 23740 + }, + { + "epoch": 0.23741, + "grad_norm": 1.101932757731516, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 23741 + }, + { + "epoch": 0.23742, + "grad_norm": 0.9240266059580302, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 23742 + }, + { + "epoch": 0.23743, + "grad_norm": 0.9777630766475063, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 23743 + }, + { + "epoch": 0.23744, + "grad_norm": 0.9532760535159879, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 23744 + }, + { + "epoch": 0.23745, + "grad_norm": 0.9578988240507131, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 23745 + }, + { + "epoch": 0.23746, + "grad_norm": 0.948282548513073, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 23746 + }, + { + "epoch": 0.23747, + "grad_norm": 1.1266205813666343, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 23747 + }, + { + "epoch": 0.23748, + "grad_norm": 0.9455660471383677, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 23748 + }, + { + "epoch": 0.23749, + "grad_norm": 1.0556565216795553, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 23749 + }, + { + "epoch": 0.2375, + "grad_norm": 1.2075693151324032, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 23750 + }, + { + "epoch": 0.23751, + "grad_norm": 0.9355026323387243, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 23751 + }, + { + "epoch": 0.23752, + "grad_norm": 0.8740309165215498, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 23752 + }, + { + "epoch": 0.23753, + "grad_norm": 0.8128975080080184, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 23753 + }, + { + "epoch": 0.23754, + "grad_norm": 0.7540731723693127, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 23754 + }, + { + "epoch": 0.23755, + "grad_norm": 0.6966786043359898, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 23755 + }, + { + "epoch": 0.23756, + "grad_norm": 0.7035375621121372, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 23756 + }, + { + "epoch": 0.23757, + "grad_norm": 0.7413219939164101, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 23757 + }, + { + "epoch": 0.23758, + "grad_norm": 0.8082120029716918, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 23758 + }, + { + "epoch": 0.23759, + "grad_norm": 1.0040605348583993, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 23759 + }, + { + "epoch": 0.2376, + "grad_norm": 1.1156474671494216, + "learning_rate": 0.003, + "loss": 4.046, + "step": 23760 + }, + { + "epoch": 0.23761, + "grad_norm": 0.8940647208220829, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 23761 + }, + { + "epoch": 0.23762, + "grad_norm": 0.8636884557517225, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 23762 + }, + { + "epoch": 0.23763, + "grad_norm": 0.8059181598095077, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 23763 + }, + { + "epoch": 0.23764, + "grad_norm": 0.7359689937991982, + "learning_rate": 0.003, + "loss": 4.0096, + "step": 23764 + }, + { + "epoch": 0.23765, + "grad_norm": 0.7052047258115853, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 23765 + }, + { + "epoch": 0.23766, + "grad_norm": 0.7053016619129944, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 23766 + }, + { + "epoch": 0.23767, + "grad_norm": 0.634681941680801, + "learning_rate": 0.003, + "loss": 4.014, + "step": 23767 + }, + { + "epoch": 0.23768, + "grad_norm": 0.6379477048246032, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 23768 + }, + { + "epoch": 0.23769, + "grad_norm": 0.7165958126511536, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 23769 + }, + { + "epoch": 0.2377, + "grad_norm": 0.8059448359700745, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 23770 + }, + { + "epoch": 0.23771, + "grad_norm": 0.9165908723360074, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 23771 + }, + { + "epoch": 0.23772, + "grad_norm": 1.1363440354514454, + "learning_rate": 0.003, + "loss": 4.027, + "step": 23772 + }, + { + "epoch": 0.23773, + "grad_norm": 0.9435709742458889, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 23773 + }, + { + "epoch": 0.23774, + "grad_norm": 0.9853652896027093, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 23774 + }, + { + "epoch": 0.23775, + "grad_norm": 0.986838507451984, + "learning_rate": 0.003, + "loss": 4.049, + "step": 23775 + }, + { + "epoch": 0.23776, + "grad_norm": 0.8955411308927379, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 23776 + }, + { + "epoch": 0.23777, + "grad_norm": 0.8387329218582318, + "learning_rate": 0.003, + "loss": 4.049, + "step": 23777 + }, + { + "epoch": 0.23778, + "grad_norm": 0.9356896828060617, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 23778 + }, + { + "epoch": 0.23779, + "grad_norm": 1.0917706522165258, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 23779 + }, + { + "epoch": 0.2378, + "grad_norm": 0.9714825727349592, + "learning_rate": 0.003, + "loss": 4.0017, + "step": 23780 + }, + { + "epoch": 0.23781, + "grad_norm": 0.9462718964720743, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 23781 + }, + { + "epoch": 0.23782, + "grad_norm": 0.9073306152103564, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 23782 + }, + { + "epoch": 0.23783, + "grad_norm": 0.7049492342336101, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 23783 + }, + { + "epoch": 0.23784, + "grad_norm": 0.6951098464330825, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 23784 + }, + { + "epoch": 0.23785, + "grad_norm": 0.7375645933427062, + "learning_rate": 0.003, + "loss": 4.021, + "step": 23785 + }, + { + "epoch": 0.23786, + "grad_norm": 0.8769100255082033, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 23786 + }, + { + "epoch": 0.23787, + "grad_norm": 0.9868474641372071, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 23787 + }, + { + "epoch": 0.23788, + "grad_norm": 1.044571696665442, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 23788 + }, + { + "epoch": 0.23789, + "grad_norm": 1.1026872352115349, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 23789 + }, + { + "epoch": 0.2379, + "grad_norm": 0.9138550452245233, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 23790 + }, + { + "epoch": 0.23791, + "grad_norm": 1.0640233372869399, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 23791 + }, + { + "epoch": 0.23792, + "grad_norm": 1.120476918971707, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 23792 + }, + { + "epoch": 0.23793, + "grad_norm": 0.9535255213492942, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 23793 + }, + { + "epoch": 0.23794, + "grad_norm": 0.9982005136533192, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 23794 + }, + { + "epoch": 0.23795, + "grad_norm": 0.9811287827583272, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 23795 + }, + { + "epoch": 0.23796, + "grad_norm": 0.8825797221493334, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 23796 + }, + { + "epoch": 0.23797, + "grad_norm": 0.8786571476249699, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 23797 + }, + { + "epoch": 0.23798, + "grad_norm": 0.8217438953646821, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 23798 + }, + { + "epoch": 0.23799, + "grad_norm": 0.8241426772628611, + "learning_rate": 0.003, + "loss": 4.075, + "step": 23799 + }, + { + "epoch": 0.238, + "grad_norm": 0.806284747066766, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 23800 + }, + { + "epoch": 0.23801, + "grad_norm": 0.7699279123731254, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 23801 + }, + { + "epoch": 0.23802, + "grad_norm": 0.7793582936417139, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 23802 + }, + { + "epoch": 0.23803, + "grad_norm": 0.9858019055772262, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 23803 + }, + { + "epoch": 0.23804, + "grad_norm": 1.0210858876720903, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 23804 + }, + { + "epoch": 0.23805, + "grad_norm": 0.894510215298178, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 23805 + }, + { + "epoch": 0.23806, + "grad_norm": 1.0842775381717982, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 23806 + }, + { + "epoch": 0.23807, + "grad_norm": 1.1790647380800272, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 23807 + }, + { + "epoch": 0.23808, + "grad_norm": 0.975198201371548, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 23808 + }, + { + "epoch": 0.23809, + "grad_norm": 0.9448235297635766, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 23809 + }, + { + "epoch": 0.2381, + "grad_norm": 0.8309856053087825, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 23810 + }, + { + "epoch": 0.23811, + "grad_norm": 0.7431981536876313, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 23811 + }, + { + "epoch": 0.23812, + "grad_norm": 0.6693241524162311, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 23812 + }, + { + "epoch": 0.23813, + "grad_norm": 0.7051333290537272, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 23813 + }, + { + "epoch": 0.23814, + "grad_norm": 0.6653198217036655, + "learning_rate": 0.003, + "loss": 4.075, + "step": 23814 + }, + { + "epoch": 0.23815, + "grad_norm": 0.5650805771726254, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 23815 + }, + { + "epoch": 0.23816, + "grad_norm": 0.5720528675378561, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 23816 + }, + { + "epoch": 0.23817, + "grad_norm": 0.55667001135488, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 23817 + }, + { + "epoch": 0.23818, + "grad_norm": 0.5688252003169696, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 23818 + }, + { + "epoch": 0.23819, + "grad_norm": 0.5770735792760818, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 23819 + }, + { + "epoch": 0.2382, + "grad_norm": 0.6415005448927382, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 23820 + }, + { + "epoch": 0.23821, + "grad_norm": 0.7792790899285379, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 23821 + }, + { + "epoch": 0.23822, + "grad_norm": 1.2401253810872785, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 23822 + }, + { + "epoch": 0.23823, + "grad_norm": 0.9654666156907661, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 23823 + }, + { + "epoch": 0.23824, + "grad_norm": 0.9530324568754521, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 23824 + }, + { + "epoch": 0.23825, + "grad_norm": 1.1707237383671287, + "learning_rate": 0.003, + "loss": 4.043, + "step": 23825 + }, + { + "epoch": 0.23826, + "grad_norm": 0.8708757840944379, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 23826 + }, + { + "epoch": 0.23827, + "grad_norm": 0.8306739795853243, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 23827 + }, + { + "epoch": 0.23828, + "grad_norm": 0.8041387911269132, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 23828 + }, + { + "epoch": 0.23829, + "grad_norm": 0.7870930722708536, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 23829 + }, + { + "epoch": 0.2383, + "grad_norm": 0.8231782286884456, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 23830 + }, + { + "epoch": 0.23831, + "grad_norm": 0.8328594937417843, + "learning_rate": 0.003, + "loss": 4.034, + "step": 23831 + }, + { + "epoch": 0.23832, + "grad_norm": 0.8367398697427005, + "learning_rate": 0.003, + "loss": 4.046, + "step": 23832 + }, + { + "epoch": 0.23833, + "grad_norm": 0.8869077275210392, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 23833 + }, + { + "epoch": 0.23834, + "grad_norm": 0.9771233275735915, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 23834 + }, + { + "epoch": 0.23835, + "grad_norm": 0.9445842169776447, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 23835 + }, + { + "epoch": 0.23836, + "grad_norm": 1.0796517589401968, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 23836 + }, + { + "epoch": 0.23837, + "grad_norm": 0.9999686768476472, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 23837 + }, + { + "epoch": 0.23838, + "grad_norm": 1.0188576010165489, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 23838 + }, + { + "epoch": 0.23839, + "grad_norm": 1.0188012835907987, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 23839 + }, + { + "epoch": 0.2384, + "grad_norm": 0.9035928233198262, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 23840 + }, + { + "epoch": 0.23841, + "grad_norm": 0.9449464339944221, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 23841 + }, + { + "epoch": 0.23842, + "grad_norm": 1.108643556061732, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 23842 + }, + { + "epoch": 0.23843, + "grad_norm": 0.8717757225857686, + "learning_rate": 0.003, + "loss": 4.068, + "step": 23843 + }, + { + "epoch": 0.23844, + "grad_norm": 0.8666097149021877, + "learning_rate": 0.003, + "loss": 4.072, + "step": 23844 + }, + { + "epoch": 0.23845, + "grad_norm": 1.0759369814913384, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 23845 + }, + { + "epoch": 0.23846, + "grad_norm": 1.2178127552228601, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 23846 + }, + { + "epoch": 0.23847, + "grad_norm": 0.8303896574997529, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 23847 + }, + { + "epoch": 0.23848, + "grad_norm": 0.8042276507324703, + "learning_rate": 0.003, + "loss": 4.031, + "step": 23848 + }, + { + "epoch": 0.23849, + "grad_norm": 0.9313481058300013, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 23849 + }, + { + "epoch": 0.2385, + "grad_norm": 1.1634654039350076, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 23850 + }, + { + "epoch": 0.23851, + "grad_norm": 0.8514752157085883, + "learning_rate": 0.003, + "loss": 4.073, + "step": 23851 + }, + { + "epoch": 0.23852, + "grad_norm": 0.7076461178687795, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 23852 + }, + { + "epoch": 0.23853, + "grad_norm": 0.7503145225617919, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 23853 + }, + { + "epoch": 0.23854, + "grad_norm": 0.7202968172791401, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 23854 + }, + { + "epoch": 0.23855, + "grad_norm": 0.719678691366509, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 23855 + }, + { + "epoch": 0.23856, + "grad_norm": 0.8275547809991393, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 23856 + }, + { + "epoch": 0.23857, + "grad_norm": 0.9871145001597039, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 23857 + }, + { + "epoch": 0.23858, + "grad_norm": 1.115484560260526, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 23858 + }, + { + "epoch": 0.23859, + "grad_norm": 0.8518385877430924, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 23859 + }, + { + "epoch": 0.2386, + "grad_norm": 0.795377862230423, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 23860 + }, + { + "epoch": 0.23861, + "grad_norm": 0.8530868156680435, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 23861 + }, + { + "epoch": 0.23862, + "grad_norm": 0.854205737404698, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 23862 + }, + { + "epoch": 0.23863, + "grad_norm": 0.8895193214946078, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 23863 + }, + { + "epoch": 0.23864, + "grad_norm": 0.8366774362897966, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 23864 + }, + { + "epoch": 0.23865, + "grad_norm": 0.8869314133643494, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 23865 + }, + { + "epoch": 0.23866, + "grad_norm": 0.7409505229499392, + "learning_rate": 0.003, + "loss": 4.043, + "step": 23866 + }, + { + "epoch": 0.23867, + "grad_norm": 0.6710412370291577, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 23867 + }, + { + "epoch": 0.23868, + "grad_norm": 0.6744115315788555, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 23868 + }, + { + "epoch": 0.23869, + "grad_norm": 0.9195077068497159, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 23869 + }, + { + "epoch": 0.2387, + "grad_norm": 1.2714772447849427, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 23870 + }, + { + "epoch": 0.23871, + "grad_norm": 0.6770058295215355, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 23871 + }, + { + "epoch": 0.23872, + "grad_norm": 0.6309307076653112, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 23872 + }, + { + "epoch": 0.23873, + "grad_norm": 0.5990751958506392, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 23873 + }, + { + "epoch": 0.23874, + "grad_norm": 0.6680844834978303, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 23874 + }, + { + "epoch": 0.23875, + "grad_norm": 0.726616253775967, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 23875 + }, + { + "epoch": 0.23876, + "grad_norm": 0.6410081277309755, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 23876 + }, + { + "epoch": 0.23877, + "grad_norm": 0.6071173451199402, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 23877 + }, + { + "epoch": 0.23878, + "grad_norm": 0.6686577462701669, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 23878 + }, + { + "epoch": 0.23879, + "grad_norm": 0.7575692661423011, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 23879 + }, + { + "epoch": 0.2388, + "grad_norm": 1.0635534878555628, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 23880 + }, + { + "epoch": 0.23881, + "grad_norm": 1.282914970321341, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 23881 + }, + { + "epoch": 0.23882, + "grad_norm": 0.807510943337494, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 23882 + }, + { + "epoch": 0.23883, + "grad_norm": 0.7438756109729677, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 23883 + }, + { + "epoch": 0.23884, + "grad_norm": 0.7883902967380124, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 23884 + }, + { + "epoch": 0.23885, + "grad_norm": 0.8745080907165037, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 23885 + }, + { + "epoch": 0.23886, + "grad_norm": 0.992019050134065, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 23886 + }, + { + "epoch": 0.23887, + "grad_norm": 1.079792889903227, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 23887 + }, + { + "epoch": 0.23888, + "grad_norm": 1.1031556788350718, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 23888 + }, + { + "epoch": 0.23889, + "grad_norm": 1.163061635381459, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 23889 + }, + { + "epoch": 0.2389, + "grad_norm": 0.977647611777324, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 23890 + }, + { + "epoch": 0.23891, + "grad_norm": 0.9047700430063531, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 23891 + }, + { + "epoch": 0.23892, + "grad_norm": 1.10505228991655, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 23892 + }, + { + "epoch": 0.23893, + "grad_norm": 0.8821829797179381, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 23893 + }, + { + "epoch": 0.23894, + "grad_norm": 0.7941149912615949, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 23894 + }, + { + "epoch": 0.23895, + "grad_norm": 0.8697131405624108, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 23895 + }, + { + "epoch": 0.23896, + "grad_norm": 0.902187412970047, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 23896 + }, + { + "epoch": 0.23897, + "grad_norm": 1.050734562136658, + "learning_rate": 0.003, + "loss": 4.056, + "step": 23897 + }, + { + "epoch": 0.23898, + "grad_norm": 1.225133759485856, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 23898 + }, + { + "epoch": 0.23899, + "grad_norm": 0.7578595119970893, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 23899 + }, + { + "epoch": 0.239, + "grad_norm": 0.6591358689957916, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 23900 + }, + { + "epoch": 0.23901, + "grad_norm": 0.7213696754680423, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 23901 + }, + { + "epoch": 0.23902, + "grad_norm": 0.8941360339181375, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 23902 + }, + { + "epoch": 0.23903, + "grad_norm": 1.22272343566739, + "learning_rate": 0.003, + "loss": 4.084, + "step": 23903 + }, + { + "epoch": 0.23904, + "grad_norm": 0.8459556551219984, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 23904 + }, + { + "epoch": 0.23905, + "grad_norm": 0.6991213623821504, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 23905 + }, + { + "epoch": 0.23906, + "grad_norm": 0.6931773850988112, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 23906 + }, + { + "epoch": 0.23907, + "grad_norm": 0.7060989251339346, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 23907 + }, + { + "epoch": 0.23908, + "grad_norm": 0.6929639140943131, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 23908 + }, + { + "epoch": 0.23909, + "grad_norm": 0.7266225244111828, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 23909 + }, + { + "epoch": 0.2391, + "grad_norm": 0.8343071444496308, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 23910 + }, + { + "epoch": 0.23911, + "grad_norm": 0.9186558032919087, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 23911 + }, + { + "epoch": 0.23912, + "grad_norm": 1.1568098248618708, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 23912 + }, + { + "epoch": 0.23913, + "grad_norm": 1.1765447822534711, + "learning_rate": 0.003, + "loss": 4.039, + "step": 23913 + }, + { + "epoch": 0.23914, + "grad_norm": 0.888065067566215, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 23914 + }, + { + "epoch": 0.23915, + "grad_norm": 0.701921798296881, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 23915 + }, + { + "epoch": 0.23916, + "grad_norm": 0.7741549141027102, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 23916 + }, + { + "epoch": 0.23917, + "grad_norm": 0.7750561869548219, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 23917 + }, + { + "epoch": 0.23918, + "grad_norm": 0.7379412014013816, + "learning_rate": 0.003, + "loss": 4.028, + "step": 23918 + }, + { + "epoch": 0.23919, + "grad_norm": 0.667436205126358, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 23919 + }, + { + "epoch": 0.2392, + "grad_norm": 0.7006097258389764, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 23920 + }, + { + "epoch": 0.23921, + "grad_norm": 0.7199153764105264, + "learning_rate": 0.003, + "loss": 4.044, + "step": 23921 + }, + { + "epoch": 0.23922, + "grad_norm": 0.95338646918464, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 23922 + }, + { + "epoch": 0.23923, + "grad_norm": 1.1177306161222953, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 23923 + }, + { + "epoch": 0.23924, + "grad_norm": 0.8516971319288399, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 23924 + }, + { + "epoch": 0.23925, + "grad_norm": 0.7195189356447421, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 23925 + }, + { + "epoch": 0.23926, + "grad_norm": 0.8560185291034766, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 23926 + }, + { + "epoch": 0.23927, + "grad_norm": 0.9454836899241144, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 23927 + }, + { + "epoch": 0.23928, + "grad_norm": 0.8588839091992811, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 23928 + }, + { + "epoch": 0.23929, + "grad_norm": 0.8989397071990942, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 23929 + }, + { + "epoch": 0.2393, + "grad_norm": 0.943677117321925, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 23930 + }, + { + "epoch": 0.23931, + "grad_norm": 0.9073316878593287, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 23931 + }, + { + "epoch": 0.23932, + "grad_norm": 0.9422686652566484, + "learning_rate": 0.003, + "loss": 4.051, + "step": 23932 + }, + { + "epoch": 0.23933, + "grad_norm": 1.1831678230488505, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 23933 + }, + { + "epoch": 0.23934, + "grad_norm": 1.0941226389408711, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 23934 + }, + { + "epoch": 0.23935, + "grad_norm": 0.8278199637940014, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 23935 + }, + { + "epoch": 0.23936, + "grad_norm": 0.8303696103921486, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 23936 + }, + { + "epoch": 0.23937, + "grad_norm": 0.9464801771631769, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 23937 + }, + { + "epoch": 0.23938, + "grad_norm": 1.0895075966974028, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 23938 + }, + { + "epoch": 0.23939, + "grad_norm": 1.062865093566268, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 23939 + }, + { + "epoch": 0.2394, + "grad_norm": 1.0101956880586989, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 23940 + }, + { + "epoch": 0.23941, + "grad_norm": 1.0039406989902235, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 23941 + }, + { + "epoch": 0.23942, + "grad_norm": 1.0049457113166653, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 23942 + }, + { + "epoch": 0.23943, + "grad_norm": 1.1890442050350323, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 23943 + }, + { + "epoch": 0.23944, + "grad_norm": 0.9919631383837988, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 23944 + }, + { + "epoch": 0.23945, + "grad_norm": 1.031884376874631, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 23945 + }, + { + "epoch": 0.23946, + "grad_norm": 1.0281850575674434, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 23946 + }, + { + "epoch": 0.23947, + "grad_norm": 0.832043122470827, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 23947 + }, + { + "epoch": 0.23948, + "grad_norm": 0.8742482864584319, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 23948 + }, + { + "epoch": 0.23949, + "grad_norm": 0.7322700338963198, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 23949 + }, + { + "epoch": 0.2395, + "grad_norm": 0.7692639399506629, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 23950 + }, + { + "epoch": 0.23951, + "grad_norm": 0.9281173395562962, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 23951 + }, + { + "epoch": 0.23952, + "grad_norm": 1.1837769876789066, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 23952 + }, + { + "epoch": 0.23953, + "grad_norm": 0.8848549376178267, + "learning_rate": 0.003, + "loss": 4.063, + "step": 23953 + }, + { + "epoch": 0.23954, + "grad_norm": 0.9993075832636185, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 23954 + }, + { + "epoch": 0.23955, + "grad_norm": 1.1526762999632307, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 23955 + }, + { + "epoch": 0.23956, + "grad_norm": 0.8215830345725534, + "learning_rate": 0.003, + "loss": 4.086, + "step": 23956 + }, + { + "epoch": 0.23957, + "grad_norm": 0.779914910608157, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 23957 + }, + { + "epoch": 0.23958, + "grad_norm": 0.8308680009635977, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 23958 + }, + { + "epoch": 0.23959, + "grad_norm": 0.8479161618904443, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 23959 + }, + { + "epoch": 0.2396, + "grad_norm": 0.9078884902641015, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 23960 + }, + { + "epoch": 0.23961, + "grad_norm": 0.9268873818269759, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 23961 + }, + { + "epoch": 0.23962, + "grad_norm": 1.0509853299007614, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 23962 + }, + { + "epoch": 0.23963, + "grad_norm": 1.1278199704024026, + "learning_rate": 0.003, + "loss": 4.028, + "step": 23963 + }, + { + "epoch": 0.23964, + "grad_norm": 1.1672871698584926, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 23964 + }, + { + "epoch": 0.23965, + "grad_norm": 0.8818154374994653, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 23965 + }, + { + "epoch": 0.23966, + "grad_norm": 0.795286769845189, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 23966 + }, + { + "epoch": 0.23967, + "grad_norm": 0.7486297084821716, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 23967 + }, + { + "epoch": 0.23968, + "grad_norm": 0.763368071363066, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 23968 + }, + { + "epoch": 0.23969, + "grad_norm": 0.8735000690838187, + "learning_rate": 0.003, + "loss": 4.054, + "step": 23969 + }, + { + "epoch": 0.2397, + "grad_norm": 0.8884296537657199, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 23970 + }, + { + "epoch": 0.23971, + "grad_norm": 0.8925903222962948, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 23971 + }, + { + "epoch": 0.23972, + "grad_norm": 1.0062558089726468, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 23972 + }, + { + "epoch": 0.23973, + "grad_norm": 1.0104350210104025, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 23973 + }, + { + "epoch": 0.23974, + "grad_norm": 0.9477021336502717, + "learning_rate": 0.003, + "loss": 4.068, + "step": 23974 + }, + { + "epoch": 0.23975, + "grad_norm": 0.9884272259898024, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 23975 + }, + { + "epoch": 0.23976, + "grad_norm": 1.0415007609763645, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 23976 + }, + { + "epoch": 0.23977, + "grad_norm": 0.9913468290844657, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 23977 + }, + { + "epoch": 0.23978, + "grad_norm": 0.990606825698148, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 23978 + }, + { + "epoch": 0.23979, + "grad_norm": 0.9052510115102118, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 23979 + }, + { + "epoch": 0.2398, + "grad_norm": 0.8807073781520176, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 23980 + }, + { + "epoch": 0.23981, + "grad_norm": 0.8737174013016102, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 23981 + }, + { + "epoch": 0.23982, + "grad_norm": 0.9016961800169254, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 23982 + }, + { + "epoch": 0.23983, + "grad_norm": 0.8186522578905069, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 23983 + }, + { + "epoch": 0.23984, + "grad_norm": 0.687195031125146, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 23984 + }, + { + "epoch": 0.23985, + "grad_norm": 0.6848577582069004, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 23985 + }, + { + "epoch": 0.23986, + "grad_norm": 0.7811671403393963, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 23986 + }, + { + "epoch": 0.23987, + "grad_norm": 0.9379284798848163, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 23987 + }, + { + "epoch": 0.23988, + "grad_norm": 1.2349473219242542, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 23988 + }, + { + "epoch": 0.23989, + "grad_norm": 0.8519903952432376, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 23989 + }, + { + "epoch": 0.2399, + "grad_norm": 0.9152604046045248, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 23990 + }, + { + "epoch": 0.23991, + "grad_norm": 0.7908477818215786, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 23991 + }, + { + "epoch": 0.23992, + "grad_norm": 0.6752846025344641, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 23992 + }, + { + "epoch": 0.23993, + "grad_norm": 0.6130458187944698, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 23993 + }, + { + "epoch": 0.23994, + "grad_norm": 0.7017133338001408, + "learning_rate": 0.003, + "loss": 4.051, + "step": 23994 + }, + { + "epoch": 0.23995, + "grad_norm": 0.9462146215640923, + "learning_rate": 0.003, + "loss": 4.036, + "step": 23995 + }, + { + "epoch": 0.23996, + "grad_norm": 1.2356017573618747, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 23996 + }, + { + "epoch": 0.23997, + "grad_norm": 0.8376309062213451, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 23997 + }, + { + "epoch": 0.23998, + "grad_norm": 0.7806909865944034, + "learning_rate": 0.003, + "loss": 4.047, + "step": 23998 + }, + { + "epoch": 0.23999, + "grad_norm": 0.7623008924102859, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 23999 + }, + { + "epoch": 0.24, + "grad_norm": 0.7429570086202645, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 24000 + }, + { + "epoch": 0.24001, + "grad_norm": 0.891403679073063, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 24001 + }, + { + "epoch": 0.24002, + "grad_norm": 1.0607137502136506, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 24002 + }, + { + "epoch": 0.24003, + "grad_norm": 0.8462446955922235, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 24003 + }, + { + "epoch": 0.24004, + "grad_norm": 0.8724979271571859, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 24004 + }, + { + "epoch": 0.24005, + "grad_norm": 1.0165472860385638, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 24005 + }, + { + "epoch": 0.24006, + "grad_norm": 0.808744868229454, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 24006 + }, + { + "epoch": 0.24007, + "grad_norm": 0.6461027697386886, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 24007 + }, + { + "epoch": 0.24008, + "grad_norm": 0.7257636692510661, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 24008 + }, + { + "epoch": 0.24009, + "grad_norm": 0.7879524264192855, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 24009 + }, + { + "epoch": 0.2401, + "grad_norm": 0.8281139836294894, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 24010 + }, + { + "epoch": 0.24011, + "grad_norm": 0.8052142616607947, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 24011 + }, + { + "epoch": 0.24012, + "grad_norm": 0.8682225494515519, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 24012 + }, + { + "epoch": 0.24013, + "grad_norm": 0.9463939182510178, + "learning_rate": 0.003, + "loss": 4.052, + "step": 24013 + }, + { + "epoch": 0.24014, + "grad_norm": 0.9358372485485583, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 24014 + }, + { + "epoch": 0.24015, + "grad_norm": 0.8914743728838098, + "learning_rate": 0.003, + "loss": 4.021, + "step": 24015 + }, + { + "epoch": 0.24016, + "grad_norm": 0.82966190079915, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 24016 + }, + { + "epoch": 0.24017, + "grad_norm": 0.9872198219003735, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 24017 + }, + { + "epoch": 0.24018, + "grad_norm": 1.1194449032240241, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 24018 + }, + { + "epoch": 0.24019, + "grad_norm": 0.8812518925645841, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 24019 + }, + { + "epoch": 0.2402, + "grad_norm": 0.8529573273868983, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 24020 + }, + { + "epoch": 0.24021, + "grad_norm": 0.8574789545078222, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 24021 + }, + { + "epoch": 0.24022, + "grad_norm": 0.9072276055402922, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 24022 + }, + { + "epoch": 0.24023, + "grad_norm": 0.8182505805919096, + "learning_rate": 0.003, + "loss": 4.037, + "step": 24023 + }, + { + "epoch": 0.24024, + "grad_norm": 0.8598264411315909, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 24024 + }, + { + "epoch": 0.24025, + "grad_norm": 0.9191350454275453, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 24025 + }, + { + "epoch": 0.24026, + "grad_norm": 1.1681167880550225, + "learning_rate": 0.003, + "loss": 4.053, + "step": 24026 + }, + { + "epoch": 0.24027, + "grad_norm": 1.1422904363946265, + "learning_rate": 0.003, + "loss": 4.086, + "step": 24027 + }, + { + "epoch": 0.24028, + "grad_norm": 0.9968835088333793, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 24028 + }, + { + "epoch": 0.24029, + "grad_norm": 0.9608823465490182, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 24029 + }, + { + "epoch": 0.2403, + "grad_norm": 0.9110222516140425, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 24030 + }, + { + "epoch": 0.24031, + "grad_norm": 0.8406450963945513, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 24031 + }, + { + "epoch": 0.24032, + "grad_norm": 0.7207218593477188, + "learning_rate": 0.003, + "loss": 4.0028, + "step": 24032 + }, + { + "epoch": 0.24033, + "grad_norm": 0.7292615696924318, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 24033 + }, + { + "epoch": 0.24034, + "grad_norm": 0.7617648139464747, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 24034 + }, + { + "epoch": 0.24035, + "grad_norm": 0.8138200955879704, + "learning_rate": 0.003, + "loss": 4.052, + "step": 24035 + }, + { + "epoch": 0.24036, + "grad_norm": 0.8842014737287486, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 24036 + }, + { + "epoch": 0.24037, + "grad_norm": 0.9080577727404103, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 24037 + }, + { + "epoch": 0.24038, + "grad_norm": 0.9772843637010848, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 24038 + }, + { + "epoch": 0.24039, + "grad_norm": 0.9448048142964297, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 24039 + }, + { + "epoch": 0.2404, + "grad_norm": 0.8908336163678491, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 24040 + }, + { + "epoch": 0.24041, + "grad_norm": 0.8247061209426142, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 24041 + }, + { + "epoch": 0.24042, + "grad_norm": 0.7057120894638902, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 24042 + }, + { + "epoch": 0.24043, + "grad_norm": 0.7753361667428217, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 24043 + }, + { + "epoch": 0.24044, + "grad_norm": 0.8642769111189467, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 24044 + }, + { + "epoch": 0.24045, + "grad_norm": 0.8105010132379571, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 24045 + }, + { + "epoch": 0.24046, + "grad_norm": 0.9321346138541184, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 24046 + }, + { + "epoch": 0.24047, + "grad_norm": 1.172618432642941, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 24047 + }, + { + "epoch": 0.24048, + "grad_norm": 0.8845395641424207, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 24048 + }, + { + "epoch": 0.24049, + "grad_norm": 0.8266929907203441, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 24049 + }, + { + "epoch": 0.2405, + "grad_norm": 0.7570682587469636, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 24050 + }, + { + "epoch": 0.24051, + "grad_norm": 0.7661694990978863, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 24051 + }, + { + "epoch": 0.24052, + "grad_norm": 0.8754538021049573, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 24052 + }, + { + "epoch": 0.24053, + "grad_norm": 0.9897165739946506, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 24053 + }, + { + "epoch": 0.24054, + "grad_norm": 1.0698247632615983, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 24054 + }, + { + "epoch": 0.24055, + "grad_norm": 1.0627201411958178, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 24055 + }, + { + "epoch": 0.24056, + "grad_norm": 0.9362564997402691, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 24056 + }, + { + "epoch": 0.24057, + "grad_norm": 0.8467236984263127, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 24057 + }, + { + "epoch": 0.24058, + "grad_norm": 0.7607122760446369, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 24058 + }, + { + "epoch": 0.24059, + "grad_norm": 0.8469732919983674, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 24059 + }, + { + "epoch": 0.2406, + "grad_norm": 0.850599671449039, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 24060 + }, + { + "epoch": 0.24061, + "grad_norm": 0.797701451558165, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 24061 + }, + { + "epoch": 0.24062, + "grad_norm": 0.7576945317355044, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 24062 + }, + { + "epoch": 0.24063, + "grad_norm": 0.813543565878261, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 24063 + }, + { + "epoch": 0.24064, + "grad_norm": 0.9144138688692207, + "learning_rate": 0.003, + "loss": 4.016, + "step": 24064 + }, + { + "epoch": 0.24065, + "grad_norm": 0.9599663133224493, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 24065 + }, + { + "epoch": 0.24066, + "grad_norm": 0.9303293374328918, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 24066 + }, + { + "epoch": 0.24067, + "grad_norm": 1.126293048651876, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 24067 + }, + { + "epoch": 0.24068, + "grad_norm": 0.9795215225076365, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 24068 + }, + { + "epoch": 0.24069, + "grad_norm": 0.9155905487857411, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 24069 + }, + { + "epoch": 0.2407, + "grad_norm": 0.8820146983859433, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 24070 + }, + { + "epoch": 0.24071, + "grad_norm": 0.8586767212271174, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 24071 + }, + { + "epoch": 0.24072, + "grad_norm": 0.839633664291364, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 24072 + }, + { + "epoch": 0.24073, + "grad_norm": 0.8261316751111537, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 24073 + }, + { + "epoch": 0.24074, + "grad_norm": 0.8610115053803861, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 24074 + }, + { + "epoch": 0.24075, + "grad_norm": 0.9258687684645923, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 24075 + }, + { + "epoch": 0.24076, + "grad_norm": 0.9160259048291821, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 24076 + }, + { + "epoch": 0.24077, + "grad_norm": 0.8630133468852037, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 24077 + }, + { + "epoch": 0.24078, + "grad_norm": 0.9624628384547445, + "learning_rate": 0.003, + "loss": 4.075, + "step": 24078 + }, + { + "epoch": 0.24079, + "grad_norm": 1.2035624116654873, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 24079 + }, + { + "epoch": 0.2408, + "grad_norm": 0.9286162687609989, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 24080 + }, + { + "epoch": 0.24081, + "grad_norm": 0.8938556852424284, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 24081 + }, + { + "epoch": 0.24082, + "grad_norm": 0.9187324482863597, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 24082 + }, + { + "epoch": 0.24083, + "grad_norm": 0.8986330453597926, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 24083 + }, + { + "epoch": 0.24084, + "grad_norm": 0.7920071130393982, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 24084 + }, + { + "epoch": 0.24085, + "grad_norm": 0.8262623026327576, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 24085 + }, + { + "epoch": 0.24086, + "grad_norm": 0.9509769629925109, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 24086 + }, + { + "epoch": 0.24087, + "grad_norm": 0.9078900604210004, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 24087 + }, + { + "epoch": 0.24088, + "grad_norm": 0.8671285347289105, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 24088 + }, + { + "epoch": 0.24089, + "grad_norm": 0.7877398246175549, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 24089 + }, + { + "epoch": 0.2409, + "grad_norm": 0.9602516606503598, + "learning_rate": 0.003, + "loss": 4.034, + "step": 24090 + }, + { + "epoch": 0.24091, + "grad_norm": 1.2133207349124513, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 24091 + }, + { + "epoch": 0.24092, + "grad_norm": 0.9467568404172214, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 24092 + }, + { + "epoch": 0.24093, + "grad_norm": 0.9152984018909553, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 24093 + }, + { + "epoch": 0.24094, + "grad_norm": 0.8802671079002148, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 24094 + }, + { + "epoch": 0.24095, + "grad_norm": 0.8403928829283397, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 24095 + }, + { + "epoch": 0.24096, + "grad_norm": 0.8368004699832426, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 24096 + }, + { + "epoch": 0.24097, + "grad_norm": 0.8841447491975679, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 24097 + }, + { + "epoch": 0.24098, + "grad_norm": 0.8676446276360308, + "learning_rate": 0.003, + "loss": 4.056, + "step": 24098 + }, + { + "epoch": 0.24099, + "grad_norm": 0.6923580289860628, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 24099 + }, + { + "epoch": 0.241, + "grad_norm": 0.7559695744672192, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 24100 + }, + { + "epoch": 0.24101, + "grad_norm": 0.7833923172909907, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 24101 + }, + { + "epoch": 0.24102, + "grad_norm": 0.703475516395984, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 24102 + }, + { + "epoch": 0.24103, + "grad_norm": 0.7045020036834209, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 24103 + }, + { + "epoch": 0.24104, + "grad_norm": 0.8445827035933333, + "learning_rate": 0.003, + "loss": 4.0042, + "step": 24104 + }, + { + "epoch": 0.24105, + "grad_norm": 1.0090209676329296, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 24105 + }, + { + "epoch": 0.24106, + "grad_norm": 1.0583835261738044, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 24106 + }, + { + "epoch": 0.24107, + "grad_norm": 1.1115520289521503, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 24107 + }, + { + "epoch": 0.24108, + "grad_norm": 0.9322299807031126, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 24108 + }, + { + "epoch": 0.24109, + "grad_norm": 0.8429793769133921, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 24109 + }, + { + "epoch": 0.2411, + "grad_norm": 0.8528789339592429, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 24110 + }, + { + "epoch": 0.24111, + "grad_norm": 0.9193852572100761, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 24111 + }, + { + "epoch": 0.24112, + "grad_norm": 0.9153164852877265, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 24112 + }, + { + "epoch": 0.24113, + "grad_norm": 0.9015085025933132, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 24113 + }, + { + "epoch": 0.24114, + "grad_norm": 0.8491972547100965, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 24114 + }, + { + "epoch": 0.24115, + "grad_norm": 0.8471075219888548, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 24115 + }, + { + "epoch": 0.24116, + "grad_norm": 0.8113950724145746, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 24116 + }, + { + "epoch": 0.24117, + "grad_norm": 0.9034267834812154, + "learning_rate": 0.003, + "loss": 4.045, + "step": 24117 + }, + { + "epoch": 0.24118, + "grad_norm": 0.970666594247184, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 24118 + }, + { + "epoch": 0.24119, + "grad_norm": 1.161198757457236, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 24119 + }, + { + "epoch": 0.2412, + "grad_norm": 1.0344792926560433, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 24120 + }, + { + "epoch": 0.24121, + "grad_norm": 1.0425612444621266, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 24121 + }, + { + "epoch": 0.24122, + "grad_norm": 0.9970462459624021, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 24122 + }, + { + "epoch": 0.24123, + "grad_norm": 0.9870734396806171, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 24123 + }, + { + "epoch": 0.24124, + "grad_norm": 1.0518746016196538, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 24124 + }, + { + "epoch": 0.24125, + "grad_norm": 0.9057410869551177, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 24125 + }, + { + "epoch": 0.24126, + "grad_norm": 0.9588833475371494, + "learning_rate": 0.003, + "loss": 4.057, + "step": 24126 + }, + { + "epoch": 0.24127, + "grad_norm": 1.0643234806375812, + "learning_rate": 0.003, + "loss": 4.036, + "step": 24127 + }, + { + "epoch": 0.24128, + "grad_norm": 0.9236134913779293, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 24128 + }, + { + "epoch": 0.24129, + "grad_norm": 1.0624944274734087, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 24129 + }, + { + "epoch": 0.2413, + "grad_norm": 0.8791159413056923, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 24130 + }, + { + "epoch": 0.24131, + "grad_norm": 0.858911480960589, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 24131 + }, + { + "epoch": 0.24132, + "grad_norm": 0.9598499087669667, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 24132 + }, + { + "epoch": 0.24133, + "grad_norm": 1.2988452788383125, + "learning_rate": 0.003, + "loss": 4.046, + "step": 24133 + }, + { + "epoch": 0.24134, + "grad_norm": 0.7033876645181483, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 24134 + }, + { + "epoch": 0.24135, + "grad_norm": 0.6671360324346581, + "learning_rate": 0.003, + "loss": 4.058, + "step": 24135 + }, + { + "epoch": 0.24136, + "grad_norm": 0.6994956701561872, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 24136 + }, + { + "epoch": 0.24137, + "grad_norm": 0.707419260028128, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 24137 + }, + { + "epoch": 0.24138, + "grad_norm": 0.7537736726141795, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 24138 + }, + { + "epoch": 0.24139, + "grad_norm": 0.8303890492377798, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 24139 + }, + { + "epoch": 0.2414, + "grad_norm": 0.8807602694364087, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 24140 + }, + { + "epoch": 0.24141, + "grad_norm": 0.9473322044883445, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 24141 + }, + { + "epoch": 0.24142, + "grad_norm": 0.9988504915189392, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 24142 + }, + { + "epoch": 0.24143, + "grad_norm": 0.9432456202794777, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 24143 + }, + { + "epoch": 0.24144, + "grad_norm": 0.7894190853104446, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 24144 + }, + { + "epoch": 0.24145, + "grad_norm": 0.6556073314689115, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 24145 + }, + { + "epoch": 0.24146, + "grad_norm": 0.6966591763929674, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 24146 + }, + { + "epoch": 0.24147, + "grad_norm": 0.6634421876500709, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 24147 + }, + { + "epoch": 0.24148, + "grad_norm": 0.5720177451785374, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 24148 + }, + { + "epoch": 0.24149, + "grad_norm": 0.6204068925130867, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 24149 + }, + { + "epoch": 0.2415, + "grad_norm": 0.651967665265683, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 24150 + }, + { + "epoch": 0.24151, + "grad_norm": 0.6953440635861076, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 24151 + }, + { + "epoch": 0.24152, + "grad_norm": 0.647135884978751, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 24152 + }, + { + "epoch": 0.24153, + "grad_norm": 0.674243564605426, + "learning_rate": 0.003, + "loss": 4.032, + "step": 24153 + }, + { + "epoch": 0.24154, + "grad_norm": 0.8310934764623221, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 24154 + }, + { + "epoch": 0.24155, + "grad_norm": 0.9243250824534507, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 24155 + }, + { + "epoch": 0.24156, + "grad_norm": 0.937391872060682, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 24156 + }, + { + "epoch": 0.24157, + "grad_norm": 1.2233535253653858, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 24157 + }, + { + "epoch": 0.24158, + "grad_norm": 0.9254489179576979, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 24158 + }, + { + "epoch": 0.24159, + "grad_norm": 0.8716597696921334, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 24159 + }, + { + "epoch": 0.2416, + "grad_norm": 0.7376596928044449, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 24160 + }, + { + "epoch": 0.24161, + "grad_norm": 0.6792144436005407, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 24161 + }, + { + "epoch": 0.24162, + "grad_norm": 0.7410103196910114, + "learning_rate": 0.003, + "loss": 4.065, + "step": 24162 + }, + { + "epoch": 0.24163, + "grad_norm": 0.7761152322175966, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 24163 + }, + { + "epoch": 0.24164, + "grad_norm": 0.8557496137302675, + "learning_rate": 0.003, + "loss": 4.01, + "step": 24164 + }, + { + "epoch": 0.24165, + "grad_norm": 0.9968533721175374, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 24165 + }, + { + "epoch": 0.24166, + "grad_norm": 1.0736602698831794, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 24166 + }, + { + "epoch": 0.24167, + "grad_norm": 0.9335017312424864, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 24167 + }, + { + "epoch": 0.24168, + "grad_norm": 1.002279588368271, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 24168 + }, + { + "epoch": 0.24169, + "grad_norm": 1.0304395048539323, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 24169 + }, + { + "epoch": 0.2417, + "grad_norm": 0.9004535679958675, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 24170 + }, + { + "epoch": 0.24171, + "grad_norm": 0.7623714937196778, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 24171 + }, + { + "epoch": 0.24172, + "grad_norm": 0.7925348605122874, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 24172 + }, + { + "epoch": 0.24173, + "grad_norm": 0.8022218942854448, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 24173 + }, + { + "epoch": 0.24174, + "grad_norm": 0.9715698907444348, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 24174 + }, + { + "epoch": 0.24175, + "grad_norm": 1.167788691350063, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 24175 + }, + { + "epoch": 0.24176, + "grad_norm": 0.9657799070341617, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 24176 + }, + { + "epoch": 0.24177, + "grad_norm": 1.0941027355318231, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 24177 + }, + { + "epoch": 0.24178, + "grad_norm": 1.0371006729763246, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 24178 + }, + { + "epoch": 0.24179, + "grad_norm": 0.8953247629232066, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 24179 + }, + { + "epoch": 0.2418, + "grad_norm": 1.0244448704305995, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 24180 + }, + { + "epoch": 0.24181, + "grad_norm": 1.0836058961835568, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 24181 + }, + { + "epoch": 0.24182, + "grad_norm": 1.0545956331127269, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 24182 + }, + { + "epoch": 0.24183, + "grad_norm": 1.0061748330916442, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 24183 + }, + { + "epoch": 0.24184, + "grad_norm": 0.956632143181476, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 24184 + }, + { + "epoch": 0.24185, + "grad_norm": 0.966335582308729, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 24185 + }, + { + "epoch": 0.24186, + "grad_norm": 1.053920106041618, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 24186 + }, + { + "epoch": 0.24187, + "grad_norm": 1.1552608542545688, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 24187 + }, + { + "epoch": 0.24188, + "grad_norm": 0.9568289460956998, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 24188 + }, + { + "epoch": 0.24189, + "grad_norm": 1.0775055250530354, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 24189 + }, + { + "epoch": 0.2419, + "grad_norm": 0.8990994611987276, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 24190 + }, + { + "epoch": 0.24191, + "grad_norm": 0.6953617355084967, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 24191 + }, + { + "epoch": 0.24192, + "grad_norm": 0.7683277587199077, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 24192 + }, + { + "epoch": 0.24193, + "grad_norm": 0.7317281515955276, + "learning_rate": 0.003, + "loss": 4.052, + "step": 24193 + }, + { + "epoch": 0.24194, + "grad_norm": 0.8010020901327233, + "learning_rate": 0.003, + "loss": 4.043, + "step": 24194 + }, + { + "epoch": 0.24195, + "grad_norm": 0.8871916582791392, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 24195 + }, + { + "epoch": 0.24196, + "grad_norm": 1.0477912538401217, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 24196 + }, + { + "epoch": 0.24197, + "grad_norm": 1.179676640863997, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 24197 + }, + { + "epoch": 0.24198, + "grad_norm": 0.7469104539208697, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 24198 + }, + { + "epoch": 0.24199, + "grad_norm": 0.6710772028299249, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 24199 + }, + { + "epoch": 0.242, + "grad_norm": 0.642294993470878, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 24200 + }, + { + "epoch": 0.24201, + "grad_norm": 0.6277056825248893, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 24201 + }, + { + "epoch": 0.24202, + "grad_norm": 0.5732283591920163, + "learning_rate": 0.003, + "loss": 4.034, + "step": 24202 + }, + { + "epoch": 0.24203, + "grad_norm": 0.5988497386267027, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 24203 + }, + { + "epoch": 0.24204, + "grad_norm": 0.676491342829408, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 24204 + }, + { + "epoch": 0.24205, + "grad_norm": 0.6355924719006589, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 24205 + }, + { + "epoch": 0.24206, + "grad_norm": 0.7630946632554243, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 24206 + }, + { + "epoch": 0.24207, + "grad_norm": 0.9743088613115153, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 24207 + }, + { + "epoch": 0.24208, + "grad_norm": 1.1462597885369197, + "learning_rate": 0.003, + "loss": 4.061, + "step": 24208 + }, + { + "epoch": 0.24209, + "grad_norm": 0.7463667024549048, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 24209 + }, + { + "epoch": 0.2421, + "grad_norm": 0.5975735189894466, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 24210 + }, + { + "epoch": 0.24211, + "grad_norm": 0.5889850735651799, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 24211 + }, + { + "epoch": 0.24212, + "grad_norm": 0.6778442522801655, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 24212 + }, + { + "epoch": 0.24213, + "grad_norm": 0.6743831995812118, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 24213 + }, + { + "epoch": 0.24214, + "grad_norm": 0.6537179332625822, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 24214 + }, + { + "epoch": 0.24215, + "grad_norm": 0.6863728873890239, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 24215 + }, + { + "epoch": 0.24216, + "grad_norm": 0.7716472549885176, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 24216 + }, + { + "epoch": 0.24217, + "grad_norm": 0.8782807038157397, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 24217 + }, + { + "epoch": 0.24218, + "grad_norm": 0.9787453524008378, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 24218 + }, + { + "epoch": 0.24219, + "grad_norm": 1.0638054005236948, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 24219 + }, + { + "epoch": 0.2422, + "grad_norm": 1.0092302853948862, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 24220 + }, + { + "epoch": 0.24221, + "grad_norm": 1.034893608291703, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 24221 + }, + { + "epoch": 0.24222, + "grad_norm": 0.8978393050177532, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 24222 + }, + { + "epoch": 0.24223, + "grad_norm": 0.7945652743714896, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 24223 + }, + { + "epoch": 0.24224, + "grad_norm": 0.9326021576163757, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 24224 + }, + { + "epoch": 0.24225, + "grad_norm": 1.068358400059819, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 24225 + }, + { + "epoch": 0.24226, + "grad_norm": 1.0259352143570122, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 24226 + }, + { + "epoch": 0.24227, + "grad_norm": 1.0682534945349964, + "learning_rate": 0.003, + "loss": 4.082, + "step": 24227 + }, + { + "epoch": 0.24228, + "grad_norm": 1.0588508751646972, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 24228 + }, + { + "epoch": 0.24229, + "grad_norm": 1.1630363174986336, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 24229 + }, + { + "epoch": 0.2423, + "grad_norm": 1.0040485555428182, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 24230 + }, + { + "epoch": 0.24231, + "grad_norm": 1.1284992210312648, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 24231 + }, + { + "epoch": 0.24232, + "grad_norm": 1.1364189822818151, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 24232 + }, + { + "epoch": 0.24233, + "grad_norm": 0.8012730611671595, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 24233 + }, + { + "epoch": 0.24234, + "grad_norm": 0.7666971470983873, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 24234 + }, + { + "epoch": 0.24235, + "grad_norm": 0.8723108545176743, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 24235 + }, + { + "epoch": 0.24236, + "grad_norm": 0.8902507190691931, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 24236 + }, + { + "epoch": 0.24237, + "grad_norm": 1.054389849590314, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 24237 + }, + { + "epoch": 0.24238, + "grad_norm": 1.0113354389044351, + "learning_rate": 0.003, + "loss": 4.062, + "step": 24238 + }, + { + "epoch": 0.24239, + "grad_norm": 1.1018681673602049, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 24239 + }, + { + "epoch": 0.2424, + "grad_norm": 1.012787456329008, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 24240 + }, + { + "epoch": 0.24241, + "grad_norm": 0.9098124280491132, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 24241 + }, + { + "epoch": 0.24242, + "grad_norm": 0.9162320998383073, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 24242 + }, + { + "epoch": 0.24243, + "grad_norm": 0.9171120967426635, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 24243 + }, + { + "epoch": 0.24244, + "grad_norm": 0.8288125447727074, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 24244 + }, + { + "epoch": 0.24245, + "grad_norm": 0.7921256702533814, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 24245 + }, + { + "epoch": 0.24246, + "grad_norm": 0.6547228551426508, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 24246 + }, + { + "epoch": 0.24247, + "grad_norm": 0.6154267253320896, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 24247 + }, + { + "epoch": 0.24248, + "grad_norm": 0.5655510060592761, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 24248 + }, + { + "epoch": 0.24249, + "grad_norm": 0.5656358907799476, + "learning_rate": 0.003, + "loss": 4.04, + "step": 24249 + }, + { + "epoch": 0.2425, + "grad_norm": 0.6595247236447949, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 24250 + }, + { + "epoch": 0.24251, + "grad_norm": 0.7606006796457544, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 24251 + }, + { + "epoch": 0.24252, + "grad_norm": 0.7728654368369797, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 24252 + }, + { + "epoch": 0.24253, + "grad_norm": 0.8125461978029466, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 24253 + }, + { + "epoch": 0.24254, + "grad_norm": 1.0074276727792024, + "learning_rate": 0.003, + "loss": 4.039, + "step": 24254 + }, + { + "epoch": 0.24255, + "grad_norm": 1.3180408203580654, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 24255 + }, + { + "epoch": 0.24256, + "grad_norm": 0.8490214284262066, + "learning_rate": 0.003, + "loss": 4.032, + "step": 24256 + }, + { + "epoch": 0.24257, + "grad_norm": 0.8766396399202889, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 24257 + }, + { + "epoch": 0.24258, + "grad_norm": 0.7682491727145239, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 24258 + }, + { + "epoch": 0.24259, + "grad_norm": 0.7727747110084662, + "learning_rate": 0.003, + "loss": 4.002, + "step": 24259 + }, + { + "epoch": 0.2426, + "grad_norm": 0.9023445328646724, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 24260 + }, + { + "epoch": 0.24261, + "grad_norm": 1.0851170341260725, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 24261 + }, + { + "epoch": 0.24262, + "grad_norm": 1.1201151544391619, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 24262 + }, + { + "epoch": 0.24263, + "grad_norm": 0.7282776761565849, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 24263 + }, + { + "epoch": 0.24264, + "grad_norm": 0.7703995668135574, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 24264 + }, + { + "epoch": 0.24265, + "grad_norm": 0.9032639226083838, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 24265 + }, + { + "epoch": 0.24266, + "grad_norm": 1.002913863602453, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 24266 + }, + { + "epoch": 0.24267, + "grad_norm": 1.052561614646673, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 24267 + }, + { + "epoch": 0.24268, + "grad_norm": 0.9347242590951947, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 24268 + }, + { + "epoch": 0.24269, + "grad_norm": 0.9820523379471302, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 24269 + }, + { + "epoch": 0.2427, + "grad_norm": 1.0924795750111573, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 24270 + }, + { + "epoch": 0.24271, + "grad_norm": 1.0393275879710435, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 24271 + }, + { + "epoch": 0.24272, + "grad_norm": 0.9728710896968386, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 24272 + }, + { + "epoch": 0.24273, + "grad_norm": 0.8816673775283153, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 24273 + }, + { + "epoch": 0.24274, + "grad_norm": 1.0100960014260774, + "learning_rate": 0.003, + "loss": 4.07, + "step": 24274 + }, + { + "epoch": 0.24275, + "grad_norm": 1.2296447550917293, + "learning_rate": 0.003, + "loss": 4.048, + "step": 24275 + }, + { + "epoch": 0.24276, + "grad_norm": 0.8795849839402438, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 24276 + }, + { + "epoch": 0.24277, + "grad_norm": 0.7322407848306921, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 24277 + }, + { + "epoch": 0.24278, + "grad_norm": 0.7325024425579449, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 24278 + }, + { + "epoch": 0.24279, + "grad_norm": 0.6664534797868003, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 24279 + }, + { + "epoch": 0.2428, + "grad_norm": 0.6453087650570909, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 24280 + }, + { + "epoch": 0.24281, + "grad_norm": 0.6923670370070717, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 24281 + }, + { + "epoch": 0.24282, + "grad_norm": 0.6484054864314717, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 24282 + }, + { + "epoch": 0.24283, + "grad_norm": 0.7605076425691207, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 24283 + }, + { + "epoch": 0.24284, + "grad_norm": 0.8429779027342893, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 24284 + }, + { + "epoch": 0.24285, + "grad_norm": 0.7898554949164536, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 24285 + }, + { + "epoch": 0.24286, + "grad_norm": 0.6933544366141227, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 24286 + }, + { + "epoch": 0.24287, + "grad_norm": 0.6796303861773102, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 24287 + }, + { + "epoch": 0.24288, + "grad_norm": 0.6029291372376555, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 24288 + }, + { + "epoch": 0.24289, + "grad_norm": 0.5881464980014078, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 24289 + }, + { + "epoch": 0.2429, + "grad_norm": 0.651136758179586, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 24290 + }, + { + "epoch": 0.24291, + "grad_norm": 0.9153731252558913, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 24291 + }, + { + "epoch": 0.24292, + "grad_norm": 1.3474063057734336, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 24292 + }, + { + "epoch": 0.24293, + "grad_norm": 0.677031978897014, + "learning_rate": 0.003, + "loss": 3.997, + "step": 24293 + }, + { + "epoch": 0.24294, + "grad_norm": 0.6731321064724827, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 24294 + }, + { + "epoch": 0.24295, + "grad_norm": 0.6798318244558856, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 24295 + }, + { + "epoch": 0.24296, + "grad_norm": 0.6375843831806745, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 24296 + }, + { + "epoch": 0.24297, + "grad_norm": 0.6729926264270194, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 24297 + }, + { + "epoch": 0.24298, + "grad_norm": 0.7324733622903582, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 24298 + }, + { + "epoch": 0.24299, + "grad_norm": 0.9069620571468769, + "learning_rate": 0.003, + "loss": 4.031, + "step": 24299 + }, + { + "epoch": 0.243, + "grad_norm": 0.9972826409042985, + "learning_rate": 0.003, + "loss": 4.047, + "step": 24300 + }, + { + "epoch": 0.24301, + "grad_norm": 1.1127576757734765, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 24301 + }, + { + "epoch": 0.24302, + "grad_norm": 1.0143016483866745, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 24302 + }, + { + "epoch": 0.24303, + "grad_norm": 0.9381428663087634, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 24303 + }, + { + "epoch": 0.24304, + "grad_norm": 1.0541222790608915, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 24304 + }, + { + "epoch": 0.24305, + "grad_norm": 1.220678775709384, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 24305 + }, + { + "epoch": 0.24306, + "grad_norm": 0.8765027876004091, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 24306 + }, + { + "epoch": 0.24307, + "grad_norm": 0.9817730369750407, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 24307 + }, + { + "epoch": 0.24308, + "grad_norm": 1.1872241326313566, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 24308 + }, + { + "epoch": 0.24309, + "grad_norm": 0.782724219798603, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 24309 + }, + { + "epoch": 0.2431, + "grad_norm": 0.7776886030186891, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 24310 + }, + { + "epoch": 0.24311, + "grad_norm": 0.8318199707466516, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 24311 + }, + { + "epoch": 0.24312, + "grad_norm": 0.8426983867386284, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 24312 + }, + { + "epoch": 0.24313, + "grad_norm": 0.852138243236149, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 24313 + }, + { + "epoch": 0.24314, + "grad_norm": 1.1302089332431762, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 24314 + }, + { + "epoch": 0.24315, + "grad_norm": 1.0320332850477054, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 24315 + }, + { + "epoch": 0.24316, + "grad_norm": 0.916896084894644, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 24316 + }, + { + "epoch": 0.24317, + "grad_norm": 0.8335526627679575, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 24317 + }, + { + "epoch": 0.24318, + "grad_norm": 0.9103889407566678, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 24318 + }, + { + "epoch": 0.24319, + "grad_norm": 1.100555373003163, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 24319 + }, + { + "epoch": 0.2432, + "grad_norm": 0.7989921546119533, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 24320 + }, + { + "epoch": 0.24321, + "grad_norm": 0.758276426865167, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 24321 + }, + { + "epoch": 0.24322, + "grad_norm": 0.8109402183804613, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 24322 + }, + { + "epoch": 0.24323, + "grad_norm": 0.8753917152344733, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 24323 + }, + { + "epoch": 0.24324, + "grad_norm": 1.0209258137716997, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 24324 + }, + { + "epoch": 0.24325, + "grad_norm": 1.2201533941441849, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 24325 + }, + { + "epoch": 0.24326, + "grad_norm": 0.8374792611631589, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 24326 + }, + { + "epoch": 0.24327, + "grad_norm": 0.8356536837201828, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 24327 + }, + { + "epoch": 0.24328, + "grad_norm": 0.8125940116736208, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 24328 + }, + { + "epoch": 0.24329, + "grad_norm": 0.844528381739035, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 24329 + }, + { + "epoch": 0.2433, + "grad_norm": 0.9276686820154229, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 24330 + }, + { + "epoch": 0.24331, + "grad_norm": 1.0918135620640965, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 24331 + }, + { + "epoch": 0.24332, + "grad_norm": 0.9022723421239631, + "learning_rate": 0.003, + "loss": 4.078, + "step": 24332 + }, + { + "epoch": 0.24333, + "grad_norm": 0.8522007131246603, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 24333 + }, + { + "epoch": 0.24334, + "grad_norm": 0.9088536463096427, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 24334 + }, + { + "epoch": 0.24335, + "grad_norm": 0.8795732024738103, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 24335 + }, + { + "epoch": 0.24336, + "grad_norm": 0.8419468767859976, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 24336 + }, + { + "epoch": 0.24337, + "grad_norm": 0.847179246601703, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 24337 + }, + { + "epoch": 0.24338, + "grad_norm": 0.816899248272882, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 24338 + }, + { + "epoch": 0.24339, + "grad_norm": 0.7476287007640905, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 24339 + }, + { + "epoch": 0.2434, + "grad_norm": 0.7355001806883271, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 24340 + }, + { + "epoch": 0.24341, + "grad_norm": 0.883264695970655, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 24341 + }, + { + "epoch": 0.24342, + "grad_norm": 1.147953845061727, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 24342 + }, + { + "epoch": 0.24343, + "grad_norm": 0.9312668237821587, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 24343 + }, + { + "epoch": 0.24344, + "grad_norm": 1.0466720575363302, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 24344 + }, + { + "epoch": 0.24345, + "grad_norm": 1.1412413053078623, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 24345 + }, + { + "epoch": 0.24346, + "grad_norm": 0.9516014194879931, + "learning_rate": 0.003, + "loss": 4.063, + "step": 24346 + }, + { + "epoch": 0.24347, + "grad_norm": 0.9382388128949345, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 24347 + }, + { + "epoch": 0.24348, + "grad_norm": 0.9244033979742177, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 24348 + }, + { + "epoch": 0.24349, + "grad_norm": 0.8806876867652167, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 24349 + }, + { + "epoch": 0.2435, + "grad_norm": 0.8774994066516855, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 24350 + }, + { + "epoch": 0.24351, + "grad_norm": 0.9669429264958953, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 24351 + }, + { + "epoch": 0.24352, + "grad_norm": 0.9750995866386545, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 24352 + }, + { + "epoch": 0.24353, + "grad_norm": 1.0642171007194772, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 24353 + }, + { + "epoch": 0.24354, + "grad_norm": 1.023318935289836, + "learning_rate": 0.003, + "loss": 4.07, + "step": 24354 + }, + { + "epoch": 0.24355, + "grad_norm": 1.037057094905722, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 24355 + }, + { + "epoch": 0.24356, + "grad_norm": 0.8528389113927962, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 24356 + }, + { + "epoch": 0.24357, + "grad_norm": 0.8084312248659596, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 24357 + }, + { + "epoch": 0.24358, + "grad_norm": 0.8994360509207241, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 24358 + }, + { + "epoch": 0.24359, + "grad_norm": 0.9506378387000618, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 24359 + }, + { + "epoch": 0.2436, + "grad_norm": 0.8435290336291547, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 24360 + }, + { + "epoch": 0.24361, + "grad_norm": 0.6930848325647802, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 24361 + }, + { + "epoch": 0.24362, + "grad_norm": 0.6855402735593364, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 24362 + }, + { + "epoch": 0.24363, + "grad_norm": 0.6282268791818146, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 24363 + }, + { + "epoch": 0.24364, + "grad_norm": 0.6529410185477602, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 24364 + }, + { + "epoch": 0.24365, + "grad_norm": 0.7812121432098154, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 24365 + }, + { + "epoch": 0.24366, + "grad_norm": 1.1277676898190097, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 24366 + }, + { + "epoch": 0.24367, + "grad_norm": 1.1297822136217337, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 24367 + }, + { + "epoch": 0.24368, + "grad_norm": 0.8177340570124606, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 24368 + }, + { + "epoch": 0.24369, + "grad_norm": 0.911387406128397, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 24369 + }, + { + "epoch": 0.2437, + "grad_norm": 0.9365083714800969, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 24370 + }, + { + "epoch": 0.24371, + "grad_norm": 0.858893091702094, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 24371 + }, + { + "epoch": 0.24372, + "grad_norm": 0.9163546378870618, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 24372 + }, + { + "epoch": 0.24373, + "grad_norm": 1.0582870825659023, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 24373 + }, + { + "epoch": 0.24374, + "grad_norm": 0.9334249060481281, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 24374 + }, + { + "epoch": 0.24375, + "grad_norm": 0.9799317083201554, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 24375 + }, + { + "epoch": 0.24376, + "grad_norm": 1.1045988202635622, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 24376 + }, + { + "epoch": 0.24377, + "grad_norm": 0.9975518141001947, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 24377 + }, + { + "epoch": 0.24378, + "grad_norm": 0.9658299093965321, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 24378 + }, + { + "epoch": 0.24379, + "grad_norm": 0.8155898042883143, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 24379 + }, + { + "epoch": 0.2438, + "grad_norm": 0.814010990543558, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 24380 + }, + { + "epoch": 0.24381, + "grad_norm": 0.8087958503543714, + "learning_rate": 0.003, + "loss": 4.06, + "step": 24381 + }, + { + "epoch": 0.24382, + "grad_norm": 0.8697846829860854, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 24382 + }, + { + "epoch": 0.24383, + "grad_norm": 0.9175689317485503, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 24383 + }, + { + "epoch": 0.24384, + "grad_norm": 1.005163379336692, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 24384 + }, + { + "epoch": 0.24385, + "grad_norm": 1.0397244058946067, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 24385 + }, + { + "epoch": 0.24386, + "grad_norm": 0.8556278638141066, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 24386 + }, + { + "epoch": 0.24387, + "grad_norm": 0.803804951620591, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 24387 + }, + { + "epoch": 0.24388, + "grad_norm": 0.8118970192039237, + "learning_rate": 0.003, + "loss": 4.032, + "step": 24388 + }, + { + "epoch": 0.24389, + "grad_norm": 0.9646815950518908, + "learning_rate": 0.003, + "loss": 4.066, + "step": 24389 + }, + { + "epoch": 0.2439, + "grad_norm": 1.0119267231370443, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 24390 + }, + { + "epoch": 0.24391, + "grad_norm": 1.0989985571835281, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 24391 + }, + { + "epoch": 0.24392, + "grad_norm": 0.9792742723407618, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 24392 + }, + { + "epoch": 0.24393, + "grad_norm": 0.888863836050796, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 24393 + }, + { + "epoch": 0.24394, + "grad_norm": 0.7022669694122133, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 24394 + }, + { + "epoch": 0.24395, + "grad_norm": 0.6878590637291429, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 24395 + }, + { + "epoch": 0.24396, + "grad_norm": 0.5908483768920613, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 24396 + }, + { + "epoch": 0.24397, + "grad_norm": 0.5355051428876914, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 24397 + }, + { + "epoch": 0.24398, + "grad_norm": 0.5372096324250133, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 24398 + }, + { + "epoch": 0.24399, + "grad_norm": 0.5651486344449366, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 24399 + }, + { + "epoch": 0.244, + "grad_norm": 0.601368453393794, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 24400 + }, + { + "epoch": 0.24401, + "grad_norm": 0.6220715313771641, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 24401 + }, + { + "epoch": 0.24402, + "grad_norm": 0.6163657436173646, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 24402 + }, + { + "epoch": 0.24403, + "grad_norm": 0.6155634994260076, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 24403 + }, + { + "epoch": 0.24404, + "grad_norm": 0.6122338660245654, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 24404 + }, + { + "epoch": 0.24405, + "grad_norm": 0.6586059752067807, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 24405 + }, + { + "epoch": 0.24406, + "grad_norm": 0.8409746987929847, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 24406 + }, + { + "epoch": 0.24407, + "grad_norm": 1.2167870661472961, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 24407 + }, + { + "epoch": 0.24408, + "grad_norm": 1.1624399269643086, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 24408 + }, + { + "epoch": 0.24409, + "grad_norm": 1.0177443994499635, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 24409 + }, + { + "epoch": 0.2441, + "grad_norm": 1.088981240220816, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 24410 + }, + { + "epoch": 0.24411, + "grad_norm": 0.9805322389670114, + "learning_rate": 0.003, + "loss": 4.031, + "step": 24411 + }, + { + "epoch": 0.24412, + "grad_norm": 0.983151090768275, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 24412 + }, + { + "epoch": 0.24413, + "grad_norm": 1.0216249476932648, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 24413 + }, + { + "epoch": 0.24414, + "grad_norm": 0.9387040218719899, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 24414 + }, + { + "epoch": 0.24415, + "grad_norm": 0.9427996239529268, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 24415 + }, + { + "epoch": 0.24416, + "grad_norm": 0.7726171323072034, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 24416 + }, + { + "epoch": 0.24417, + "grad_norm": 0.8761754992263954, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 24417 + }, + { + "epoch": 0.24418, + "grad_norm": 1.0686455921997247, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 24418 + }, + { + "epoch": 0.24419, + "grad_norm": 1.0908022871719525, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 24419 + }, + { + "epoch": 0.2442, + "grad_norm": 0.8541406837957457, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 24420 + }, + { + "epoch": 0.24421, + "grad_norm": 0.885758959494135, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 24421 + }, + { + "epoch": 0.24422, + "grad_norm": 0.9547142044914406, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 24422 + }, + { + "epoch": 0.24423, + "grad_norm": 1.0533696961035683, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 24423 + }, + { + "epoch": 0.24424, + "grad_norm": 0.969538276128009, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 24424 + }, + { + "epoch": 0.24425, + "grad_norm": 1.0818288288985716, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 24425 + }, + { + "epoch": 0.24426, + "grad_norm": 1.034316125393436, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 24426 + }, + { + "epoch": 0.24427, + "grad_norm": 0.9508345299795138, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 24427 + }, + { + "epoch": 0.24428, + "grad_norm": 1.045863445948455, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 24428 + }, + { + "epoch": 0.24429, + "grad_norm": 1.0239659339972875, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 24429 + }, + { + "epoch": 0.2443, + "grad_norm": 1.039770757932298, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 24430 + }, + { + "epoch": 0.24431, + "grad_norm": 1.0297198312633498, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 24431 + }, + { + "epoch": 0.24432, + "grad_norm": 0.9654788859834478, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 24432 + }, + { + "epoch": 0.24433, + "grad_norm": 0.9690528358014315, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 24433 + }, + { + "epoch": 0.24434, + "grad_norm": 1.0109256894738612, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 24434 + }, + { + "epoch": 0.24435, + "grad_norm": 1.0010957223393737, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 24435 + }, + { + "epoch": 0.24436, + "grad_norm": 1.054536591309224, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 24436 + }, + { + "epoch": 0.24437, + "grad_norm": 1.0223286449754077, + "learning_rate": 0.003, + "loss": 4.038, + "step": 24437 + }, + { + "epoch": 0.24438, + "grad_norm": 0.9570499914386106, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 24438 + }, + { + "epoch": 0.24439, + "grad_norm": 0.9422571672786546, + "learning_rate": 0.003, + "loss": 4.058, + "step": 24439 + }, + { + "epoch": 0.2444, + "grad_norm": 0.9645464832832905, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 24440 + }, + { + "epoch": 0.24441, + "grad_norm": 1.0420590883156042, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 24441 + }, + { + "epoch": 0.24442, + "grad_norm": 0.852256173245297, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 24442 + }, + { + "epoch": 0.24443, + "grad_norm": 0.7411281220278786, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 24443 + }, + { + "epoch": 0.24444, + "grad_norm": 0.712094237709309, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 24444 + }, + { + "epoch": 0.24445, + "grad_norm": 0.8415610013976501, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 24445 + }, + { + "epoch": 0.24446, + "grad_norm": 0.947926946681682, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 24446 + }, + { + "epoch": 0.24447, + "grad_norm": 1.0947504770310268, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 24447 + }, + { + "epoch": 0.24448, + "grad_norm": 0.9135832351063355, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 24448 + }, + { + "epoch": 0.24449, + "grad_norm": 0.925166794047048, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 24449 + }, + { + "epoch": 0.2445, + "grad_norm": 1.1143021409451108, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 24450 + }, + { + "epoch": 0.24451, + "grad_norm": 0.9939752701268647, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 24451 + }, + { + "epoch": 0.24452, + "grad_norm": 1.0081036507024388, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 24452 + }, + { + "epoch": 0.24453, + "grad_norm": 1.018741683828051, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 24453 + }, + { + "epoch": 0.24454, + "grad_norm": 0.8103139534307585, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 24454 + }, + { + "epoch": 0.24455, + "grad_norm": 0.7462387815193459, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 24455 + }, + { + "epoch": 0.24456, + "grad_norm": 0.7286375087075916, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 24456 + }, + { + "epoch": 0.24457, + "grad_norm": 0.7996862448008925, + "learning_rate": 0.003, + "loss": 4.049, + "step": 24457 + }, + { + "epoch": 0.24458, + "grad_norm": 0.7208759097321523, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 24458 + }, + { + "epoch": 0.24459, + "grad_norm": 0.8642368488907551, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 24459 + }, + { + "epoch": 0.2446, + "grad_norm": 1.152726837349672, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 24460 + }, + { + "epoch": 0.24461, + "grad_norm": 0.9634881046961308, + "learning_rate": 0.003, + "loss": 4.058, + "step": 24461 + }, + { + "epoch": 0.24462, + "grad_norm": 0.8070163171672409, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 24462 + }, + { + "epoch": 0.24463, + "grad_norm": 0.7314973056339306, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 24463 + }, + { + "epoch": 0.24464, + "grad_norm": 0.6358579966626574, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 24464 + }, + { + "epoch": 0.24465, + "grad_norm": 0.6533617287048956, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 24465 + }, + { + "epoch": 0.24466, + "grad_norm": 0.6458387295110005, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 24466 + }, + { + "epoch": 0.24467, + "grad_norm": 0.7789525457207829, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 24467 + }, + { + "epoch": 0.24468, + "grad_norm": 0.7741433302318864, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 24468 + }, + { + "epoch": 0.24469, + "grad_norm": 0.7069997049876953, + "learning_rate": 0.003, + "loss": 4.089, + "step": 24469 + }, + { + "epoch": 0.2447, + "grad_norm": 0.6601098571565199, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 24470 + }, + { + "epoch": 0.24471, + "grad_norm": 0.6860472221961996, + "learning_rate": 0.003, + "loss": 4.028, + "step": 24471 + }, + { + "epoch": 0.24472, + "grad_norm": 0.7542101040776972, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 24472 + }, + { + "epoch": 0.24473, + "grad_norm": 0.9479543221470939, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 24473 + }, + { + "epoch": 0.24474, + "grad_norm": 1.3122349840516991, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 24474 + }, + { + "epoch": 0.24475, + "grad_norm": 0.7782334959312933, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 24475 + }, + { + "epoch": 0.24476, + "grad_norm": 0.6485967236211962, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 24476 + }, + { + "epoch": 0.24477, + "grad_norm": 0.6695866426233174, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 24477 + }, + { + "epoch": 0.24478, + "grad_norm": 0.7679246093453307, + "learning_rate": 0.003, + "loss": 4.048, + "step": 24478 + }, + { + "epoch": 0.24479, + "grad_norm": 0.7823657632506431, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 24479 + }, + { + "epoch": 0.2448, + "grad_norm": 0.8248241381331618, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 24480 + }, + { + "epoch": 0.24481, + "grad_norm": 0.991312044256379, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 24481 + }, + { + "epoch": 0.24482, + "grad_norm": 1.209441981026284, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 24482 + }, + { + "epoch": 0.24483, + "grad_norm": 0.8359220232153316, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 24483 + }, + { + "epoch": 0.24484, + "grad_norm": 0.7673875730449259, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 24484 + }, + { + "epoch": 0.24485, + "grad_norm": 0.8984091683424169, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 24485 + }, + { + "epoch": 0.24486, + "grad_norm": 0.9267010252873435, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 24486 + }, + { + "epoch": 0.24487, + "grad_norm": 0.9265271550963003, + "learning_rate": 0.003, + "loss": 4.043, + "step": 24487 + }, + { + "epoch": 0.24488, + "grad_norm": 0.9454430342517407, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 24488 + }, + { + "epoch": 0.24489, + "grad_norm": 1.0299287751967152, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 24489 + }, + { + "epoch": 0.2449, + "grad_norm": 1.2684214157343432, + "learning_rate": 0.003, + "loss": 4.078, + "step": 24490 + }, + { + "epoch": 0.24491, + "grad_norm": 1.0357306082180378, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 24491 + }, + { + "epoch": 0.24492, + "grad_norm": 1.0256960624927896, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 24492 + }, + { + "epoch": 0.24493, + "grad_norm": 1.0261104633484393, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 24493 + }, + { + "epoch": 0.24494, + "grad_norm": 0.9969551753139565, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 24494 + }, + { + "epoch": 0.24495, + "grad_norm": 0.9159010962327077, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 24495 + }, + { + "epoch": 0.24496, + "grad_norm": 1.0386224763459773, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 24496 + }, + { + "epoch": 0.24497, + "grad_norm": 1.112029523923787, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 24497 + }, + { + "epoch": 0.24498, + "grad_norm": 1.0846915951808902, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 24498 + }, + { + "epoch": 0.24499, + "grad_norm": 0.8904941839709802, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 24499 + }, + { + "epoch": 0.245, + "grad_norm": 0.8983980505009838, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 24500 + }, + { + "epoch": 0.24501, + "grad_norm": 0.8990993133307748, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 24501 + }, + { + "epoch": 0.24502, + "grad_norm": 0.9616035618335347, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 24502 + }, + { + "epoch": 0.24503, + "grad_norm": 1.0473821737012714, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 24503 + }, + { + "epoch": 0.24504, + "grad_norm": 1.0715748104037908, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 24504 + }, + { + "epoch": 0.24505, + "grad_norm": 0.9173272758342427, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 24505 + }, + { + "epoch": 0.24506, + "grad_norm": 0.7635617413598949, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 24506 + }, + { + "epoch": 0.24507, + "grad_norm": 0.7045735912434508, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 24507 + }, + { + "epoch": 0.24508, + "grad_norm": 0.6656034642751955, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 24508 + }, + { + "epoch": 0.24509, + "grad_norm": 0.7339683669241369, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 24509 + }, + { + "epoch": 0.2451, + "grad_norm": 0.8787924541871007, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 24510 + }, + { + "epoch": 0.24511, + "grad_norm": 1.0741159663223918, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 24511 + }, + { + "epoch": 0.24512, + "grad_norm": 0.9404947670189978, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 24512 + }, + { + "epoch": 0.24513, + "grad_norm": 0.806775192995787, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 24513 + }, + { + "epoch": 0.24514, + "grad_norm": 0.8281536041647255, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 24514 + }, + { + "epoch": 0.24515, + "grad_norm": 0.9222970560580745, + "learning_rate": 0.003, + "loss": 4.053, + "step": 24515 + }, + { + "epoch": 0.24516, + "grad_norm": 1.0001790133935085, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 24516 + }, + { + "epoch": 0.24517, + "grad_norm": 1.0525706735221212, + "learning_rate": 0.003, + "loss": 4.047, + "step": 24517 + }, + { + "epoch": 0.24518, + "grad_norm": 1.0518489591809843, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 24518 + }, + { + "epoch": 0.24519, + "grad_norm": 0.9027628384600236, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 24519 + }, + { + "epoch": 0.2452, + "grad_norm": 0.7341292918381099, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 24520 + }, + { + "epoch": 0.24521, + "grad_norm": 0.7505505315678521, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 24521 + }, + { + "epoch": 0.24522, + "grad_norm": 0.7147919366249711, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 24522 + }, + { + "epoch": 0.24523, + "grad_norm": 0.7618300206719442, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 24523 + }, + { + "epoch": 0.24524, + "grad_norm": 0.843399850314302, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 24524 + }, + { + "epoch": 0.24525, + "grad_norm": 0.8962612866561526, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 24525 + }, + { + "epoch": 0.24526, + "grad_norm": 0.9710199731710112, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 24526 + }, + { + "epoch": 0.24527, + "grad_norm": 1.0536729698385872, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 24527 + }, + { + "epoch": 0.24528, + "grad_norm": 1.1065920884567686, + "learning_rate": 0.003, + "loss": 4.05, + "step": 24528 + }, + { + "epoch": 0.24529, + "grad_norm": 0.7931915600997107, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 24529 + }, + { + "epoch": 0.2453, + "grad_norm": 0.6642813299313023, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 24530 + }, + { + "epoch": 0.24531, + "grad_norm": 0.5928295581730393, + "learning_rate": 0.003, + "loss": 4.0057, + "step": 24531 + }, + { + "epoch": 0.24532, + "grad_norm": 0.6105854159000165, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 24532 + }, + { + "epoch": 0.24533, + "grad_norm": 0.6929444411840987, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 24533 + }, + { + "epoch": 0.24534, + "grad_norm": 0.8307734920885856, + "learning_rate": 0.003, + "loss": 4.057, + "step": 24534 + }, + { + "epoch": 0.24535, + "grad_norm": 0.8812980873115231, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 24535 + }, + { + "epoch": 0.24536, + "grad_norm": 0.8540772901309754, + "learning_rate": 0.003, + "loss": 4.044, + "step": 24536 + }, + { + "epoch": 0.24537, + "grad_norm": 0.8721503281246444, + "learning_rate": 0.003, + "loss": 4.043, + "step": 24537 + }, + { + "epoch": 0.24538, + "grad_norm": 0.8826222742602462, + "learning_rate": 0.003, + "loss": 4.0052, + "step": 24538 + }, + { + "epoch": 0.24539, + "grad_norm": 1.027541132939409, + "learning_rate": 0.003, + "loss": 4.046, + "step": 24539 + }, + { + "epoch": 0.2454, + "grad_norm": 0.9479684540952781, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 24540 + }, + { + "epoch": 0.24541, + "grad_norm": 0.9098121022120571, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 24541 + }, + { + "epoch": 0.24542, + "grad_norm": 0.799271098334082, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 24542 + }, + { + "epoch": 0.24543, + "grad_norm": 0.7974412236664123, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 24543 + }, + { + "epoch": 0.24544, + "grad_norm": 0.8633252007284155, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 24544 + }, + { + "epoch": 0.24545, + "grad_norm": 1.0450055269426481, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 24545 + }, + { + "epoch": 0.24546, + "grad_norm": 1.3504095530058007, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 24546 + }, + { + "epoch": 0.24547, + "grad_norm": 0.7201918766578288, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 24547 + }, + { + "epoch": 0.24548, + "grad_norm": 0.6665553792881939, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 24548 + }, + { + "epoch": 0.24549, + "grad_norm": 0.649404244455346, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 24549 + }, + { + "epoch": 0.2455, + "grad_norm": 0.711918767363004, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 24550 + }, + { + "epoch": 0.24551, + "grad_norm": 0.8861142713760095, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 24551 + }, + { + "epoch": 0.24552, + "grad_norm": 1.1463952628985503, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 24552 + }, + { + "epoch": 0.24553, + "grad_norm": 0.9979914529419902, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 24553 + }, + { + "epoch": 0.24554, + "grad_norm": 1.0355392003761392, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 24554 + }, + { + "epoch": 0.24555, + "grad_norm": 0.7490410049192778, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 24555 + }, + { + "epoch": 0.24556, + "grad_norm": 0.7940086360621044, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 24556 + }, + { + "epoch": 0.24557, + "grad_norm": 0.8355854056301416, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 24557 + }, + { + "epoch": 0.24558, + "grad_norm": 0.9660214099487647, + "learning_rate": 0.003, + "loss": 4.075, + "step": 24558 + }, + { + "epoch": 0.24559, + "grad_norm": 1.161799568779313, + "learning_rate": 0.003, + "loss": 4.064, + "step": 24559 + }, + { + "epoch": 0.2456, + "grad_norm": 0.9141415634201788, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 24560 + }, + { + "epoch": 0.24561, + "grad_norm": 0.9220859100779807, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 24561 + }, + { + "epoch": 0.24562, + "grad_norm": 0.8563433617162411, + "learning_rate": 0.003, + "loss": 4.043, + "step": 24562 + }, + { + "epoch": 0.24563, + "grad_norm": 0.8384508116151052, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 24563 + }, + { + "epoch": 0.24564, + "grad_norm": 0.7440743867370847, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 24564 + }, + { + "epoch": 0.24565, + "grad_norm": 0.6166764581732535, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 24565 + }, + { + "epoch": 0.24566, + "grad_norm": 0.622542271787728, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 24566 + }, + { + "epoch": 0.24567, + "grad_norm": 0.740494332514769, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 24567 + }, + { + "epoch": 0.24568, + "grad_norm": 0.8810675961657772, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 24568 + }, + { + "epoch": 0.24569, + "grad_norm": 1.123409218405106, + "learning_rate": 0.003, + "loss": 4.023, + "step": 24569 + }, + { + "epoch": 0.2457, + "grad_norm": 0.9942011367916302, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 24570 + }, + { + "epoch": 0.24571, + "grad_norm": 0.9873217859039962, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 24571 + }, + { + "epoch": 0.24572, + "grad_norm": 0.9443072355720511, + "learning_rate": 0.003, + "loss": 4.05, + "step": 24572 + }, + { + "epoch": 0.24573, + "grad_norm": 0.7949518462028978, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 24573 + }, + { + "epoch": 0.24574, + "grad_norm": 0.7871394044340357, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 24574 + }, + { + "epoch": 0.24575, + "grad_norm": 0.7171278673981191, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 24575 + }, + { + "epoch": 0.24576, + "grad_norm": 0.683884836392166, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 24576 + }, + { + "epoch": 0.24577, + "grad_norm": 0.7374527050245074, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 24577 + }, + { + "epoch": 0.24578, + "grad_norm": 0.7169688555678069, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 24578 + }, + { + "epoch": 0.24579, + "grad_norm": 0.919359313311798, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 24579 + }, + { + "epoch": 0.2458, + "grad_norm": 1.0816742554865615, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 24580 + }, + { + "epoch": 0.24581, + "grad_norm": 0.9876970990361058, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 24581 + }, + { + "epoch": 0.24582, + "grad_norm": 0.9040966199309631, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 24582 + }, + { + "epoch": 0.24583, + "grad_norm": 0.7528270493757065, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 24583 + }, + { + "epoch": 0.24584, + "grad_norm": 0.644371703879505, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 24584 + }, + { + "epoch": 0.24585, + "grad_norm": 0.6791416962956901, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 24585 + }, + { + "epoch": 0.24586, + "grad_norm": 0.7969920922288772, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 24586 + }, + { + "epoch": 0.24587, + "grad_norm": 0.8669266099281332, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 24587 + }, + { + "epoch": 0.24588, + "grad_norm": 0.9344526329921855, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 24588 + }, + { + "epoch": 0.24589, + "grad_norm": 0.9238748636611341, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 24589 + }, + { + "epoch": 0.2459, + "grad_norm": 0.8562565134627064, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 24590 + }, + { + "epoch": 0.24591, + "grad_norm": 0.9157681943604944, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 24591 + }, + { + "epoch": 0.24592, + "grad_norm": 0.8768472479259104, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 24592 + }, + { + "epoch": 0.24593, + "grad_norm": 0.9366894146324585, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 24593 + }, + { + "epoch": 0.24594, + "grad_norm": 1.029622894257853, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 24594 + }, + { + "epoch": 0.24595, + "grad_norm": 1.023180264617836, + "learning_rate": 0.003, + "loss": 4.051, + "step": 24595 + }, + { + "epoch": 0.24596, + "grad_norm": 1.041952609550889, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 24596 + }, + { + "epoch": 0.24597, + "grad_norm": 1.223953981909932, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 24597 + }, + { + "epoch": 0.24598, + "grad_norm": 1.024048157605506, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 24598 + }, + { + "epoch": 0.24599, + "grad_norm": 1.039935132693132, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 24599 + }, + { + "epoch": 0.246, + "grad_norm": 1.0069069146554877, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 24600 + }, + { + "epoch": 0.24601, + "grad_norm": 1.0035211573348073, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 24601 + }, + { + "epoch": 0.24602, + "grad_norm": 1.1169823236415817, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 24602 + }, + { + "epoch": 0.24603, + "grad_norm": 0.891371798503237, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 24603 + }, + { + "epoch": 0.24604, + "grad_norm": 0.9676870444959385, + "learning_rate": 0.003, + "loss": 4.054, + "step": 24604 + }, + { + "epoch": 0.24605, + "grad_norm": 1.0489158142775563, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 24605 + }, + { + "epoch": 0.24606, + "grad_norm": 0.8795753911851342, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 24606 + }, + { + "epoch": 0.24607, + "grad_norm": 0.9008059453842032, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 24607 + }, + { + "epoch": 0.24608, + "grad_norm": 0.8948294619900213, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 24608 + }, + { + "epoch": 0.24609, + "grad_norm": 0.8669148434389932, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 24609 + }, + { + "epoch": 0.2461, + "grad_norm": 0.9476235126134983, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 24610 + }, + { + "epoch": 0.24611, + "grad_norm": 1.1555889591484974, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 24611 + }, + { + "epoch": 0.24612, + "grad_norm": 0.9734972167554912, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 24612 + }, + { + "epoch": 0.24613, + "grad_norm": 1.0066209721027795, + "learning_rate": 0.003, + "loss": 4.069, + "step": 24613 + }, + { + "epoch": 0.24614, + "grad_norm": 1.1991349465218428, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 24614 + }, + { + "epoch": 0.24615, + "grad_norm": 0.893054301207456, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 24615 + }, + { + "epoch": 0.24616, + "grad_norm": 0.7525212218009527, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 24616 + }, + { + "epoch": 0.24617, + "grad_norm": 0.6814761964706885, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 24617 + }, + { + "epoch": 0.24618, + "grad_norm": 0.6158485789533341, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 24618 + }, + { + "epoch": 0.24619, + "grad_norm": 0.599654424169733, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 24619 + }, + { + "epoch": 0.2462, + "grad_norm": 0.605089332202948, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 24620 + }, + { + "epoch": 0.24621, + "grad_norm": 0.6333693455845567, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 24621 + }, + { + "epoch": 0.24622, + "grad_norm": 0.6522380367977607, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 24622 + }, + { + "epoch": 0.24623, + "grad_norm": 0.6404569613178623, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 24623 + }, + { + "epoch": 0.24624, + "grad_norm": 0.7127617209939574, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 24624 + }, + { + "epoch": 0.24625, + "grad_norm": 0.7150750049984763, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 24625 + }, + { + "epoch": 0.24626, + "grad_norm": 0.7948715753286921, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 24626 + }, + { + "epoch": 0.24627, + "grad_norm": 0.8329950199868997, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 24627 + }, + { + "epoch": 0.24628, + "grad_norm": 1.0090902242277051, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 24628 + }, + { + "epoch": 0.24629, + "grad_norm": 1.3046360215069766, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 24629 + }, + { + "epoch": 0.2463, + "grad_norm": 0.6238796384952265, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 24630 + }, + { + "epoch": 0.24631, + "grad_norm": 0.8098156655686518, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 24631 + }, + { + "epoch": 0.24632, + "grad_norm": 1.0827874234534993, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 24632 + }, + { + "epoch": 0.24633, + "grad_norm": 1.1313240245989384, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 24633 + }, + { + "epoch": 0.24634, + "grad_norm": 0.9234789050291327, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 24634 + }, + { + "epoch": 0.24635, + "grad_norm": 0.7579391562768962, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 24635 + }, + { + "epoch": 0.24636, + "grad_norm": 0.6840384425367269, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 24636 + }, + { + "epoch": 0.24637, + "grad_norm": 0.666970778594563, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 24637 + }, + { + "epoch": 0.24638, + "grad_norm": 0.5875872466756134, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 24638 + }, + { + "epoch": 0.24639, + "grad_norm": 0.6030339869890238, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 24639 + }, + { + "epoch": 0.2464, + "grad_norm": 0.8130442360060062, + "learning_rate": 0.003, + "loss": 4.024, + "step": 24640 + }, + { + "epoch": 0.24641, + "grad_norm": 0.866987205018245, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 24641 + }, + { + "epoch": 0.24642, + "grad_norm": 0.8730514946172345, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 24642 + }, + { + "epoch": 0.24643, + "grad_norm": 0.9495781595679572, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 24643 + }, + { + "epoch": 0.24644, + "grad_norm": 1.0683735204189886, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 24644 + }, + { + "epoch": 0.24645, + "grad_norm": 1.0181861336800289, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 24645 + }, + { + "epoch": 0.24646, + "grad_norm": 0.9076283002489532, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 24646 + }, + { + "epoch": 0.24647, + "grad_norm": 0.9023066045004511, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 24647 + }, + { + "epoch": 0.24648, + "grad_norm": 0.9332708068104332, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 24648 + }, + { + "epoch": 0.24649, + "grad_norm": 1.2216648557084486, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 24649 + }, + { + "epoch": 0.2465, + "grad_norm": 0.8617397414962509, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 24650 + }, + { + "epoch": 0.24651, + "grad_norm": 0.7704009370317823, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 24651 + }, + { + "epoch": 0.24652, + "grad_norm": 0.7621235717071047, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 24652 + }, + { + "epoch": 0.24653, + "grad_norm": 0.6911446747333821, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 24653 + }, + { + "epoch": 0.24654, + "grad_norm": 0.6757519167272384, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 24654 + }, + { + "epoch": 0.24655, + "grad_norm": 0.7230822977944228, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 24655 + }, + { + "epoch": 0.24656, + "grad_norm": 0.8204899761186022, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 24656 + }, + { + "epoch": 0.24657, + "grad_norm": 0.8417548021534176, + "learning_rate": 0.003, + "loss": 4.034, + "step": 24657 + }, + { + "epoch": 0.24658, + "grad_norm": 0.6914000170242425, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 24658 + }, + { + "epoch": 0.24659, + "grad_norm": 0.7541513842650349, + "learning_rate": 0.003, + "loss": 4.039, + "step": 24659 + }, + { + "epoch": 0.2466, + "grad_norm": 0.9119403770842963, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 24660 + }, + { + "epoch": 0.24661, + "grad_norm": 1.0059259801339178, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 24661 + }, + { + "epoch": 0.24662, + "grad_norm": 1.144988246335087, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 24662 + }, + { + "epoch": 0.24663, + "grad_norm": 1.0126773677529235, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 24663 + }, + { + "epoch": 0.24664, + "grad_norm": 1.23661429732905, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 24664 + }, + { + "epoch": 0.24665, + "grad_norm": 0.8853960918435931, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 24665 + }, + { + "epoch": 0.24666, + "grad_norm": 0.8211520498673379, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 24666 + }, + { + "epoch": 0.24667, + "grad_norm": 0.8818568812447078, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 24667 + }, + { + "epoch": 0.24668, + "grad_norm": 1.029482335682524, + "learning_rate": 0.003, + "loss": 4.061, + "step": 24668 + }, + { + "epoch": 0.24669, + "grad_norm": 1.2413643237833103, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 24669 + }, + { + "epoch": 0.2467, + "grad_norm": 0.9163924612219522, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 24670 + }, + { + "epoch": 0.24671, + "grad_norm": 0.8658217052942775, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 24671 + }, + { + "epoch": 0.24672, + "grad_norm": 0.9735428662871606, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 24672 + }, + { + "epoch": 0.24673, + "grad_norm": 1.0538013419192105, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 24673 + }, + { + "epoch": 0.24674, + "grad_norm": 0.9005057573009775, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 24674 + }, + { + "epoch": 0.24675, + "grad_norm": 0.9036759118640024, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 24675 + }, + { + "epoch": 0.24676, + "grad_norm": 0.9341753418882119, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 24676 + }, + { + "epoch": 0.24677, + "grad_norm": 0.9530372226742692, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 24677 + }, + { + "epoch": 0.24678, + "grad_norm": 1.0492651950375116, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 24678 + }, + { + "epoch": 0.24679, + "grad_norm": 0.9808535249134382, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 24679 + }, + { + "epoch": 0.2468, + "grad_norm": 0.9043372436810647, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 24680 + }, + { + "epoch": 0.24681, + "grad_norm": 0.8333167760827939, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 24681 + }, + { + "epoch": 0.24682, + "grad_norm": 0.7305892410341606, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 24682 + }, + { + "epoch": 0.24683, + "grad_norm": 0.8182417796276454, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 24683 + }, + { + "epoch": 0.24684, + "grad_norm": 0.8962203238233069, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 24684 + }, + { + "epoch": 0.24685, + "grad_norm": 1.022006486814936, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 24685 + }, + { + "epoch": 0.24686, + "grad_norm": 0.8235361509699674, + "learning_rate": 0.003, + "loss": 4.061, + "step": 24686 + }, + { + "epoch": 0.24687, + "grad_norm": 0.8322128349856641, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 24687 + }, + { + "epoch": 0.24688, + "grad_norm": 0.947593731940025, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 24688 + }, + { + "epoch": 0.24689, + "grad_norm": 1.0744293197372679, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 24689 + }, + { + "epoch": 0.2469, + "grad_norm": 1.0719831977834464, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 24690 + }, + { + "epoch": 0.24691, + "grad_norm": 1.009321321922841, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 24691 + }, + { + "epoch": 0.24692, + "grad_norm": 1.0086298575503758, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 24692 + }, + { + "epoch": 0.24693, + "grad_norm": 0.8484700218957829, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 24693 + }, + { + "epoch": 0.24694, + "grad_norm": 0.7011939122472073, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 24694 + }, + { + "epoch": 0.24695, + "grad_norm": 0.6973174542007287, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 24695 + }, + { + "epoch": 0.24696, + "grad_norm": 0.6460009765212417, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 24696 + }, + { + "epoch": 0.24697, + "grad_norm": 0.5822610636416075, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 24697 + }, + { + "epoch": 0.24698, + "grad_norm": 0.6295102264070797, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 24698 + }, + { + "epoch": 0.24699, + "grad_norm": 0.8215623491772215, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 24699 + }, + { + "epoch": 0.247, + "grad_norm": 1.1065741248396577, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 24700 + }, + { + "epoch": 0.24701, + "grad_norm": 1.0704589969820588, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 24701 + }, + { + "epoch": 0.24702, + "grad_norm": 0.96409858853508, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 24702 + }, + { + "epoch": 0.24703, + "grad_norm": 0.995248603870337, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 24703 + }, + { + "epoch": 0.24704, + "grad_norm": 1.0169583464904324, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 24704 + }, + { + "epoch": 0.24705, + "grad_norm": 0.92935327420754, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 24705 + }, + { + "epoch": 0.24706, + "grad_norm": 0.8762743078953602, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 24706 + }, + { + "epoch": 0.24707, + "grad_norm": 0.888311788992918, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 24707 + }, + { + "epoch": 0.24708, + "grad_norm": 0.9920518993548603, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 24708 + }, + { + "epoch": 0.24709, + "grad_norm": 1.0064818585982882, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 24709 + }, + { + "epoch": 0.2471, + "grad_norm": 0.9764175218133553, + "learning_rate": 0.003, + "loss": 4.057, + "step": 24710 + }, + { + "epoch": 0.24711, + "grad_norm": 1.030980800256014, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 24711 + }, + { + "epoch": 0.24712, + "grad_norm": 0.9028728185396866, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 24712 + }, + { + "epoch": 0.24713, + "grad_norm": 0.9469948178538513, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 24713 + }, + { + "epoch": 0.24714, + "grad_norm": 0.9417309166030591, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 24714 + }, + { + "epoch": 0.24715, + "grad_norm": 0.9080061121964882, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 24715 + }, + { + "epoch": 0.24716, + "grad_norm": 1.1144677976787054, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 24716 + }, + { + "epoch": 0.24717, + "grad_norm": 1.0757300325956651, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 24717 + }, + { + "epoch": 0.24718, + "grad_norm": 1.0115350298444612, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 24718 + }, + { + "epoch": 0.24719, + "grad_norm": 1.0042228857292683, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 24719 + }, + { + "epoch": 0.2472, + "grad_norm": 0.8617575876170146, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 24720 + }, + { + "epoch": 0.24721, + "grad_norm": 0.6371764931692104, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 24721 + }, + { + "epoch": 0.24722, + "grad_norm": 0.7827463665470048, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 24722 + }, + { + "epoch": 0.24723, + "grad_norm": 0.7877463881592692, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 24723 + }, + { + "epoch": 0.24724, + "grad_norm": 0.6977610093299308, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 24724 + }, + { + "epoch": 0.24725, + "grad_norm": 0.658963242077991, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 24725 + }, + { + "epoch": 0.24726, + "grad_norm": 0.7264138519782811, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 24726 + }, + { + "epoch": 0.24727, + "grad_norm": 0.7566545850125642, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 24727 + }, + { + "epoch": 0.24728, + "grad_norm": 0.6479300315513504, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 24728 + }, + { + "epoch": 0.24729, + "grad_norm": 0.6615274006260478, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 24729 + }, + { + "epoch": 0.2473, + "grad_norm": 0.6398172779962713, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 24730 + }, + { + "epoch": 0.24731, + "grad_norm": 0.6799161993424543, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 24731 + }, + { + "epoch": 0.24732, + "grad_norm": 0.9864726102491923, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 24732 + }, + { + "epoch": 0.24733, + "grad_norm": 1.4206291095977883, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 24733 + }, + { + "epoch": 0.24734, + "grad_norm": 0.5077966152350878, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 24734 + }, + { + "epoch": 0.24735, + "grad_norm": 0.84335173414029, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 24735 + }, + { + "epoch": 0.24736, + "grad_norm": 1.0037136452923288, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 24736 + }, + { + "epoch": 0.24737, + "grad_norm": 1.0332639285660383, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 24737 + }, + { + "epoch": 0.24738, + "grad_norm": 0.9890618706340799, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 24738 + }, + { + "epoch": 0.24739, + "grad_norm": 0.7160421752294072, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 24739 + }, + { + "epoch": 0.2474, + "grad_norm": 0.6945554613381376, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 24740 + }, + { + "epoch": 0.24741, + "grad_norm": 0.7168899707682601, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 24741 + }, + { + "epoch": 0.24742, + "grad_norm": 0.6894617129340092, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 24742 + }, + { + "epoch": 0.24743, + "grad_norm": 0.6167657059677037, + "learning_rate": 0.003, + "loss": 4.028, + "step": 24743 + }, + { + "epoch": 0.24744, + "grad_norm": 0.6147783105928425, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 24744 + }, + { + "epoch": 0.24745, + "grad_norm": 0.5732554180329266, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 24745 + }, + { + "epoch": 0.24746, + "grad_norm": 0.5866926273942326, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 24746 + }, + { + "epoch": 0.24747, + "grad_norm": 0.657803947560524, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 24747 + }, + { + "epoch": 0.24748, + "grad_norm": 0.7258347385420751, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 24748 + }, + { + "epoch": 0.24749, + "grad_norm": 0.8354675692037408, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 24749 + }, + { + "epoch": 0.2475, + "grad_norm": 1.050702796588416, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 24750 + }, + { + "epoch": 0.24751, + "grad_norm": 1.2221512772746002, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 24751 + }, + { + "epoch": 0.24752, + "grad_norm": 0.8878833253257229, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 24752 + }, + { + "epoch": 0.24753, + "grad_norm": 0.891905601398202, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 24753 + }, + { + "epoch": 0.24754, + "grad_norm": 0.8954660906896276, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 24754 + }, + { + "epoch": 0.24755, + "grad_norm": 0.8081400097737341, + "learning_rate": 0.003, + "loss": 4.003, + "step": 24755 + }, + { + "epoch": 0.24756, + "grad_norm": 0.7745240128448215, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 24756 + }, + { + "epoch": 0.24757, + "grad_norm": 0.8066309571613186, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 24757 + }, + { + "epoch": 0.24758, + "grad_norm": 0.8822188390264811, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 24758 + }, + { + "epoch": 0.24759, + "grad_norm": 1.066529063953866, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 24759 + }, + { + "epoch": 0.2476, + "grad_norm": 1.0892137216793414, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 24760 + }, + { + "epoch": 0.24761, + "grad_norm": 0.9048657406616295, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 24761 + }, + { + "epoch": 0.24762, + "grad_norm": 0.9287335087150327, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 24762 + }, + { + "epoch": 0.24763, + "grad_norm": 0.967006744373872, + "learning_rate": 0.003, + "loss": 4.075, + "step": 24763 + }, + { + "epoch": 0.24764, + "grad_norm": 0.9574266884551479, + "learning_rate": 0.003, + "loss": 4.076, + "step": 24764 + }, + { + "epoch": 0.24765, + "grad_norm": 0.9334459746376527, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 24765 + }, + { + "epoch": 0.24766, + "grad_norm": 0.9955984347985325, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 24766 + }, + { + "epoch": 0.24767, + "grad_norm": 1.3725984115284247, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 24767 + }, + { + "epoch": 0.24768, + "grad_norm": 0.8897627354191866, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 24768 + }, + { + "epoch": 0.24769, + "grad_norm": 0.8873010424819779, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 24769 + }, + { + "epoch": 0.2477, + "grad_norm": 0.9025217673476289, + "learning_rate": 0.003, + "loss": 4.073, + "step": 24770 + }, + { + "epoch": 0.24771, + "grad_norm": 0.9828900103483713, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 24771 + }, + { + "epoch": 0.24772, + "grad_norm": 1.1461005440890513, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 24772 + }, + { + "epoch": 0.24773, + "grad_norm": 0.8271918024365653, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 24773 + }, + { + "epoch": 0.24774, + "grad_norm": 0.8317913632171102, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 24774 + }, + { + "epoch": 0.24775, + "grad_norm": 0.9515172200959463, + "learning_rate": 0.003, + "loss": 4.041, + "step": 24775 + }, + { + "epoch": 0.24776, + "grad_norm": 1.0600256384223756, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 24776 + }, + { + "epoch": 0.24777, + "grad_norm": 0.9347496944658488, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 24777 + }, + { + "epoch": 0.24778, + "grad_norm": 0.9360542863318813, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 24778 + }, + { + "epoch": 0.24779, + "grad_norm": 0.8722014137978014, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 24779 + }, + { + "epoch": 0.2478, + "grad_norm": 0.9267173825630988, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 24780 + }, + { + "epoch": 0.24781, + "grad_norm": 1.0443956534226608, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 24781 + }, + { + "epoch": 0.24782, + "grad_norm": 1.0717018137122372, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 24782 + }, + { + "epoch": 0.24783, + "grad_norm": 0.9413876208645124, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 24783 + }, + { + "epoch": 0.24784, + "grad_norm": 0.9214263540762326, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 24784 + }, + { + "epoch": 0.24785, + "grad_norm": 0.9538147781170828, + "learning_rate": 0.003, + "loss": 4.057, + "step": 24785 + }, + { + "epoch": 0.24786, + "grad_norm": 1.0071186243550303, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 24786 + }, + { + "epoch": 0.24787, + "grad_norm": 0.9744871374873619, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 24787 + }, + { + "epoch": 0.24788, + "grad_norm": 0.783671659155178, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 24788 + }, + { + "epoch": 0.24789, + "grad_norm": 0.7978079683295433, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 24789 + }, + { + "epoch": 0.2479, + "grad_norm": 0.7212399446965075, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 24790 + }, + { + "epoch": 0.24791, + "grad_norm": 0.8788970269379971, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 24791 + }, + { + "epoch": 0.24792, + "grad_norm": 1.1371002890939632, + "learning_rate": 0.003, + "loss": 4.047, + "step": 24792 + }, + { + "epoch": 0.24793, + "grad_norm": 0.93647985161739, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 24793 + }, + { + "epoch": 0.24794, + "grad_norm": 0.8738315150522239, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 24794 + }, + { + "epoch": 0.24795, + "grad_norm": 0.8016735992864765, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 24795 + }, + { + "epoch": 0.24796, + "grad_norm": 0.7688918003058197, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 24796 + }, + { + "epoch": 0.24797, + "grad_norm": 0.7537986058755989, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 24797 + }, + { + "epoch": 0.24798, + "grad_norm": 0.6696285599736962, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 24798 + }, + { + "epoch": 0.24799, + "grad_norm": 0.6615559594680294, + "learning_rate": 0.003, + "loss": 4.037, + "step": 24799 + }, + { + "epoch": 0.248, + "grad_norm": 0.6497014484506389, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 24800 + }, + { + "epoch": 0.24801, + "grad_norm": 0.5817485664092115, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 24801 + }, + { + "epoch": 0.24802, + "grad_norm": 0.6355465096765639, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 24802 + }, + { + "epoch": 0.24803, + "grad_norm": 0.73560123039149, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 24803 + }, + { + "epoch": 0.24804, + "grad_norm": 0.8284279396712518, + "learning_rate": 0.003, + "loss": 4.069, + "step": 24804 + }, + { + "epoch": 0.24805, + "grad_norm": 1.0745509499017416, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 24805 + }, + { + "epoch": 0.24806, + "grad_norm": 1.1999133801130832, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 24806 + }, + { + "epoch": 0.24807, + "grad_norm": 0.6550126467021705, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 24807 + }, + { + "epoch": 0.24808, + "grad_norm": 0.5559945393777279, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 24808 + }, + { + "epoch": 0.24809, + "grad_norm": 0.6468984513478265, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 24809 + }, + { + "epoch": 0.2481, + "grad_norm": 0.6239437965266489, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 24810 + }, + { + "epoch": 0.24811, + "grad_norm": 0.6365183078204011, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 24811 + }, + { + "epoch": 0.24812, + "grad_norm": 0.8547635354364749, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 24812 + }, + { + "epoch": 0.24813, + "grad_norm": 1.0382798659209185, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 24813 + }, + { + "epoch": 0.24814, + "grad_norm": 0.9577500950739808, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 24814 + }, + { + "epoch": 0.24815, + "grad_norm": 0.9778281621053052, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 24815 + }, + { + "epoch": 0.24816, + "grad_norm": 1.203667926338086, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 24816 + }, + { + "epoch": 0.24817, + "grad_norm": 1.0414492207553028, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 24817 + }, + { + "epoch": 0.24818, + "grad_norm": 0.9908089619771676, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 24818 + }, + { + "epoch": 0.24819, + "grad_norm": 0.9557612358848938, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 24819 + }, + { + "epoch": 0.2482, + "grad_norm": 0.9632243454343027, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 24820 + }, + { + "epoch": 0.24821, + "grad_norm": 0.9270575289010755, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 24821 + }, + { + "epoch": 0.24822, + "grad_norm": 1.1387627243666694, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 24822 + }, + { + "epoch": 0.24823, + "grad_norm": 1.0276329289393864, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 24823 + }, + { + "epoch": 0.24824, + "grad_norm": 0.9504317077327361, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 24824 + }, + { + "epoch": 0.24825, + "grad_norm": 0.983997143480041, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 24825 + }, + { + "epoch": 0.24826, + "grad_norm": 1.2518179359624375, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 24826 + }, + { + "epoch": 0.24827, + "grad_norm": 0.8497038585910796, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 24827 + }, + { + "epoch": 0.24828, + "grad_norm": 0.6919839942540301, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 24828 + }, + { + "epoch": 0.24829, + "grad_norm": 0.6125319858387864, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 24829 + }, + { + "epoch": 0.2483, + "grad_norm": 0.5794198150584007, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 24830 + }, + { + "epoch": 0.24831, + "grad_norm": 0.5728331087813103, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 24831 + }, + { + "epoch": 0.24832, + "grad_norm": 0.6231918215982206, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 24832 + }, + { + "epoch": 0.24833, + "grad_norm": 0.6255952264934582, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 24833 + }, + { + "epoch": 0.24834, + "grad_norm": 0.6504108732818489, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 24834 + }, + { + "epoch": 0.24835, + "grad_norm": 0.7072904200647722, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 24835 + }, + { + "epoch": 0.24836, + "grad_norm": 0.7735055408247175, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 24836 + }, + { + "epoch": 0.24837, + "grad_norm": 0.8877538511342092, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 24837 + }, + { + "epoch": 0.24838, + "grad_norm": 0.9839276414632111, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 24838 + }, + { + "epoch": 0.24839, + "grad_norm": 0.9879033699394432, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 24839 + }, + { + "epoch": 0.2484, + "grad_norm": 1.1456824476675491, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 24840 + }, + { + "epoch": 0.24841, + "grad_norm": 0.7949750677248135, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 24841 + }, + { + "epoch": 0.24842, + "grad_norm": 0.6789789405730079, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 24842 + }, + { + "epoch": 0.24843, + "grad_norm": 0.6868752078579549, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 24843 + }, + { + "epoch": 0.24844, + "grad_norm": 0.7227304920843479, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 24844 + }, + { + "epoch": 0.24845, + "grad_norm": 0.8076739281818901, + "learning_rate": 0.003, + "loss": 4.061, + "step": 24845 + }, + { + "epoch": 0.24846, + "grad_norm": 0.9065040681895562, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 24846 + }, + { + "epoch": 0.24847, + "grad_norm": 1.0696085890558287, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 24847 + }, + { + "epoch": 0.24848, + "grad_norm": 1.2031992950138768, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 24848 + }, + { + "epoch": 0.24849, + "grad_norm": 1.0716571661164402, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 24849 + }, + { + "epoch": 0.2485, + "grad_norm": 1.0035113937137332, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 24850 + }, + { + "epoch": 0.24851, + "grad_norm": 1.0420199852033427, + "learning_rate": 0.003, + "loss": 4.045, + "step": 24851 + }, + { + "epoch": 0.24852, + "grad_norm": 1.0191196344070392, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 24852 + }, + { + "epoch": 0.24853, + "grad_norm": 0.9571857445906612, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 24853 + }, + { + "epoch": 0.24854, + "grad_norm": 0.9325267228249841, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 24854 + }, + { + "epoch": 0.24855, + "grad_norm": 1.1446378646908675, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 24855 + }, + { + "epoch": 0.24856, + "grad_norm": 0.9968365980358628, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 24856 + }, + { + "epoch": 0.24857, + "grad_norm": 1.0245139920884605, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 24857 + }, + { + "epoch": 0.24858, + "grad_norm": 1.023267065835406, + "learning_rate": 0.003, + "loss": 4.058, + "step": 24858 + }, + { + "epoch": 0.24859, + "grad_norm": 0.9163418620445073, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 24859 + }, + { + "epoch": 0.2486, + "grad_norm": 0.8688354713394478, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 24860 + }, + { + "epoch": 0.24861, + "grad_norm": 0.7638953468694308, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 24861 + }, + { + "epoch": 0.24862, + "grad_norm": 0.775892936801226, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 24862 + }, + { + "epoch": 0.24863, + "grad_norm": 0.9414704002418842, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 24863 + }, + { + "epoch": 0.24864, + "grad_norm": 1.1069737548040102, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 24864 + }, + { + "epoch": 0.24865, + "grad_norm": 0.9627236081944984, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 24865 + }, + { + "epoch": 0.24866, + "grad_norm": 1.1142266001833596, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 24866 + }, + { + "epoch": 0.24867, + "grad_norm": 0.877953098865027, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 24867 + }, + { + "epoch": 0.24868, + "grad_norm": 0.7929952625450133, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 24868 + }, + { + "epoch": 0.24869, + "grad_norm": 0.8337173579590729, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 24869 + }, + { + "epoch": 0.2487, + "grad_norm": 0.9784999540984296, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 24870 + }, + { + "epoch": 0.24871, + "grad_norm": 0.9147725816951395, + "learning_rate": 0.003, + "loss": 4.057, + "step": 24871 + }, + { + "epoch": 0.24872, + "grad_norm": 0.9431863496340517, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 24872 + }, + { + "epoch": 0.24873, + "grad_norm": 0.9740491400547014, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 24873 + }, + { + "epoch": 0.24874, + "grad_norm": 0.9598012206746511, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 24874 + }, + { + "epoch": 0.24875, + "grad_norm": 0.9626385728458898, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 24875 + }, + { + "epoch": 0.24876, + "grad_norm": 1.100486765283263, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 24876 + }, + { + "epoch": 0.24877, + "grad_norm": 0.8755189818545953, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 24877 + }, + { + "epoch": 0.24878, + "grad_norm": 0.7676829437375242, + "learning_rate": 0.003, + "loss": 4.031, + "step": 24878 + }, + { + "epoch": 0.24879, + "grad_norm": 0.686228247796003, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 24879 + }, + { + "epoch": 0.2488, + "grad_norm": 0.6967292236037987, + "learning_rate": 0.003, + "loss": 4.038, + "step": 24880 + }, + { + "epoch": 0.24881, + "grad_norm": 0.8175813489031121, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 24881 + }, + { + "epoch": 0.24882, + "grad_norm": 0.886370468618304, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 24882 + }, + { + "epoch": 0.24883, + "grad_norm": 0.9965065779238221, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 24883 + }, + { + "epoch": 0.24884, + "grad_norm": 1.0001017372960763, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 24884 + }, + { + "epoch": 0.24885, + "grad_norm": 0.8079299682451483, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 24885 + }, + { + "epoch": 0.24886, + "grad_norm": 0.7920299518496763, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 24886 + }, + { + "epoch": 0.24887, + "grad_norm": 0.8279145887675554, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 24887 + }, + { + "epoch": 0.24888, + "grad_norm": 0.7469662010775341, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 24888 + }, + { + "epoch": 0.24889, + "grad_norm": 0.7367691468173052, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 24889 + }, + { + "epoch": 0.2489, + "grad_norm": 0.7782965206923067, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 24890 + }, + { + "epoch": 0.24891, + "grad_norm": 0.8188408684244228, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 24891 + }, + { + "epoch": 0.24892, + "grad_norm": 0.8440261633842542, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 24892 + }, + { + "epoch": 0.24893, + "grad_norm": 0.9606538215243803, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 24893 + }, + { + "epoch": 0.24894, + "grad_norm": 1.2530527302284533, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 24894 + }, + { + "epoch": 0.24895, + "grad_norm": 0.8797449775830343, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 24895 + }, + { + "epoch": 0.24896, + "grad_norm": 0.9188052800326935, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 24896 + }, + { + "epoch": 0.24897, + "grad_norm": 1.0491535537198742, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 24897 + }, + { + "epoch": 0.24898, + "grad_norm": 0.8943513368138608, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 24898 + }, + { + "epoch": 0.24899, + "grad_norm": 0.851867599968842, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 24899 + }, + { + "epoch": 0.249, + "grad_norm": 0.9601212192643067, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 24900 + }, + { + "epoch": 0.24901, + "grad_norm": 1.179622299882782, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 24901 + }, + { + "epoch": 0.24902, + "grad_norm": 0.8488716073711199, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 24902 + }, + { + "epoch": 0.24903, + "grad_norm": 0.7308817866111754, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 24903 + }, + { + "epoch": 0.24904, + "grad_norm": 0.7244690098561029, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 24904 + }, + { + "epoch": 0.24905, + "grad_norm": 0.6584451941518012, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 24905 + }, + { + "epoch": 0.24906, + "grad_norm": 0.7333743058979599, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 24906 + }, + { + "epoch": 0.24907, + "grad_norm": 0.7966410189561395, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 24907 + }, + { + "epoch": 0.24908, + "grad_norm": 0.9334433371618684, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 24908 + }, + { + "epoch": 0.24909, + "grad_norm": 1.0749466387909368, + "learning_rate": 0.003, + "loss": 4.061, + "step": 24909 + }, + { + "epoch": 0.2491, + "grad_norm": 0.9487453627887742, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 24910 + }, + { + "epoch": 0.24911, + "grad_norm": 1.0007955506965904, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 24911 + }, + { + "epoch": 0.24912, + "grad_norm": 1.0039564686318823, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 24912 + }, + { + "epoch": 0.24913, + "grad_norm": 1.0003759211214167, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 24913 + }, + { + "epoch": 0.24914, + "grad_norm": 0.9306396598207398, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 24914 + }, + { + "epoch": 0.24915, + "grad_norm": 0.8012488769211433, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 24915 + }, + { + "epoch": 0.24916, + "grad_norm": 0.8033458625520037, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 24916 + }, + { + "epoch": 0.24917, + "grad_norm": 0.801851353295159, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 24917 + }, + { + "epoch": 0.24918, + "grad_norm": 0.8531648412002176, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 24918 + }, + { + "epoch": 0.24919, + "grad_norm": 0.8115686677098268, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 24919 + }, + { + "epoch": 0.2492, + "grad_norm": 0.7656263463347808, + "learning_rate": 0.003, + "loss": 4.045, + "step": 24920 + }, + { + "epoch": 0.24921, + "grad_norm": 0.8188487924561175, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 24921 + }, + { + "epoch": 0.24922, + "grad_norm": 0.8795499923126378, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 24922 + }, + { + "epoch": 0.24923, + "grad_norm": 0.7395555329643584, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 24923 + }, + { + "epoch": 0.24924, + "grad_norm": 0.7288140675284586, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 24924 + }, + { + "epoch": 0.24925, + "grad_norm": 0.8203781303967768, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 24925 + }, + { + "epoch": 0.24926, + "grad_norm": 0.813997783854369, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 24926 + }, + { + "epoch": 0.24927, + "grad_norm": 0.8009930195595387, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 24927 + }, + { + "epoch": 0.24928, + "grad_norm": 0.8451900865691463, + "learning_rate": 0.003, + "loss": 4.063, + "step": 24928 + }, + { + "epoch": 0.24929, + "grad_norm": 0.9571485694909434, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 24929 + }, + { + "epoch": 0.2493, + "grad_norm": 1.1632393280145823, + "learning_rate": 0.003, + "loss": 4.037, + "step": 24930 + }, + { + "epoch": 0.24931, + "grad_norm": 1.0313534830884388, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 24931 + }, + { + "epoch": 0.24932, + "grad_norm": 1.1209655208497427, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 24932 + }, + { + "epoch": 0.24933, + "grad_norm": 1.084133027216186, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 24933 + }, + { + "epoch": 0.24934, + "grad_norm": 0.932391633903948, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 24934 + }, + { + "epoch": 0.24935, + "grad_norm": 0.8286007566347191, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 24935 + }, + { + "epoch": 0.24936, + "grad_norm": 0.7784070356969961, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 24936 + }, + { + "epoch": 0.24937, + "grad_norm": 0.7777675795002211, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 24937 + }, + { + "epoch": 0.24938, + "grad_norm": 0.8766752872675143, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 24938 + }, + { + "epoch": 0.24939, + "grad_norm": 0.8273996662328338, + "learning_rate": 0.003, + "loss": 4.059, + "step": 24939 + }, + { + "epoch": 0.2494, + "grad_norm": 0.9403034246012082, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 24940 + }, + { + "epoch": 0.24941, + "grad_norm": 1.1423149896106994, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 24941 + }, + { + "epoch": 0.24942, + "grad_norm": 1.035884751440043, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 24942 + }, + { + "epoch": 0.24943, + "grad_norm": 1.2471782884521638, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 24943 + }, + { + "epoch": 0.24944, + "grad_norm": 0.9759278201259436, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 24944 + }, + { + "epoch": 0.24945, + "grad_norm": 1.0042091292036346, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 24945 + }, + { + "epoch": 0.24946, + "grad_norm": 0.9481287782686568, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 24946 + }, + { + "epoch": 0.24947, + "grad_norm": 0.9582763274073257, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 24947 + }, + { + "epoch": 0.24948, + "grad_norm": 0.9119848983728647, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 24948 + }, + { + "epoch": 0.24949, + "grad_norm": 0.8319973091960529, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 24949 + }, + { + "epoch": 0.2495, + "grad_norm": 0.843193795345759, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 24950 + }, + { + "epoch": 0.24951, + "grad_norm": 0.8504347772802173, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 24951 + }, + { + "epoch": 0.24952, + "grad_norm": 0.9108638648891836, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 24952 + }, + { + "epoch": 0.24953, + "grad_norm": 0.8601271859555615, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 24953 + }, + { + "epoch": 0.24954, + "grad_norm": 0.9081774866326066, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 24954 + }, + { + "epoch": 0.24955, + "grad_norm": 0.9733720661267132, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 24955 + }, + { + "epoch": 0.24956, + "grad_norm": 0.92237190093115, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 24956 + }, + { + "epoch": 0.24957, + "grad_norm": 0.81505986901497, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 24957 + }, + { + "epoch": 0.24958, + "grad_norm": 0.774415666361524, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 24958 + }, + { + "epoch": 0.24959, + "grad_norm": 0.6938756128041287, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 24959 + }, + { + "epoch": 0.2496, + "grad_norm": 0.7412381256067151, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 24960 + }, + { + "epoch": 0.24961, + "grad_norm": 0.7520596050887787, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 24961 + }, + { + "epoch": 0.24962, + "grad_norm": 0.854951810178195, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 24962 + }, + { + "epoch": 0.24963, + "grad_norm": 1.0002367487907915, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 24963 + }, + { + "epoch": 0.24964, + "grad_norm": 1.1207272549093727, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 24964 + }, + { + "epoch": 0.24965, + "grad_norm": 0.729142390671697, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 24965 + }, + { + "epoch": 0.24966, + "grad_norm": 0.6233655498523967, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 24966 + }, + { + "epoch": 0.24967, + "grad_norm": 0.6350926012944166, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 24967 + }, + { + "epoch": 0.24968, + "grad_norm": 0.764886544532569, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 24968 + }, + { + "epoch": 0.24969, + "grad_norm": 0.8855675547198433, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 24969 + }, + { + "epoch": 0.2497, + "grad_norm": 0.9471713777068682, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 24970 + }, + { + "epoch": 0.24971, + "grad_norm": 0.880132568392059, + "learning_rate": 0.003, + "loss": 4.064, + "step": 24971 + }, + { + "epoch": 0.24972, + "grad_norm": 0.9267782852662799, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 24972 + }, + { + "epoch": 0.24973, + "grad_norm": 0.8460522414116444, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 24973 + }, + { + "epoch": 0.24974, + "grad_norm": 0.867101449897478, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 24974 + }, + { + "epoch": 0.24975, + "grad_norm": 0.9236297007084834, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 24975 + }, + { + "epoch": 0.24976, + "grad_norm": 0.9534470208657182, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 24976 + }, + { + "epoch": 0.24977, + "grad_norm": 0.9579467687298114, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 24977 + }, + { + "epoch": 0.24978, + "grad_norm": 1.0074596869477315, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 24978 + }, + { + "epoch": 0.24979, + "grad_norm": 1.1933870957283397, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 24979 + }, + { + "epoch": 0.2498, + "grad_norm": 0.9262414555758269, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 24980 + }, + { + "epoch": 0.24981, + "grad_norm": 1.0407926728673673, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 24981 + }, + { + "epoch": 0.24982, + "grad_norm": 1.0573723151059415, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 24982 + }, + { + "epoch": 0.24983, + "grad_norm": 0.998923768936595, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 24983 + }, + { + "epoch": 0.24984, + "grad_norm": 0.8679313654728963, + "learning_rate": 0.003, + "loss": 4.0058, + "step": 24984 + }, + { + "epoch": 0.24985, + "grad_norm": 0.8384879860214401, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 24985 + }, + { + "epoch": 0.24986, + "grad_norm": 0.9633055182038077, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 24986 + }, + { + "epoch": 0.24987, + "grad_norm": 1.070799731316395, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 24987 + }, + { + "epoch": 0.24988, + "grad_norm": 0.9470749894070539, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 24988 + }, + { + "epoch": 0.24989, + "grad_norm": 0.9558436825349234, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 24989 + }, + { + "epoch": 0.2499, + "grad_norm": 0.9846096362443845, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 24990 + }, + { + "epoch": 0.24991, + "grad_norm": 1.053019455540664, + "learning_rate": 0.003, + "loss": 4.062, + "step": 24991 + }, + { + "epoch": 0.24992, + "grad_norm": 0.9438830834941232, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 24992 + }, + { + "epoch": 0.24993, + "grad_norm": 0.8934186776989499, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 24993 + }, + { + "epoch": 0.24994, + "grad_norm": 0.9014277737825089, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 24994 + }, + { + "epoch": 0.24995, + "grad_norm": 0.8329123557886955, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 24995 + }, + { + "epoch": 0.24996, + "grad_norm": 0.8251514479363162, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 24996 + }, + { + "epoch": 0.24997, + "grad_norm": 0.7977269692120545, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 24997 + }, + { + "epoch": 0.24998, + "grad_norm": 0.6571353906713401, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 24998 + }, + { + "epoch": 0.24999, + "grad_norm": 0.7322523637202981, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 24999 + }, + { + "epoch": 0.25, + "grad_norm": 0.8412326268340425, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 25000 + }, + { + "epoch": 0.25001, + "grad_norm": 1.037524276671091, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 25001 + }, + { + "epoch": 0.25002, + "grad_norm": 1.0190557401502651, + "learning_rate": 0.003, + "loss": 4.035, + "step": 25002 + }, + { + "epoch": 0.25003, + "grad_norm": 1.083487939041908, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 25003 + }, + { + "epoch": 0.25004, + "grad_norm": 1.001186293158713, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 25004 + }, + { + "epoch": 0.25005, + "grad_norm": 0.6619402560882597, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 25005 + }, + { + "epoch": 0.25006, + "grad_norm": 0.7721198796401331, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 25006 + }, + { + "epoch": 0.25007, + "grad_norm": 0.9574069605386972, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 25007 + }, + { + "epoch": 0.25008, + "grad_norm": 1.2899009248707805, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 25008 + }, + { + "epoch": 0.25009, + "grad_norm": 1.2224013035021652, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 25009 + }, + { + "epoch": 0.2501, + "grad_norm": 1.0757192290835778, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 25010 + }, + { + "epoch": 0.25011, + "grad_norm": 0.9376665664113467, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 25011 + }, + { + "epoch": 0.25012, + "grad_norm": 0.8812841673479098, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 25012 + }, + { + "epoch": 0.25013, + "grad_norm": 0.9729970063704426, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 25013 + }, + { + "epoch": 0.25014, + "grad_norm": 1.072993005848804, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 25014 + }, + { + "epoch": 0.25015, + "grad_norm": 1.4911524103483254, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 25015 + }, + { + "epoch": 0.25016, + "grad_norm": 1.4401482924523783, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 25016 + }, + { + "epoch": 0.25017, + "grad_norm": 1.3798354667013213, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 25017 + }, + { + "epoch": 0.25018, + "grad_norm": 0.9626967197458514, + "learning_rate": 0.003, + "loss": 4.093, + "step": 25018 + }, + { + "epoch": 0.25019, + "grad_norm": 1.058305573669329, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 25019 + }, + { + "epoch": 0.2502, + "grad_norm": 1.043219965259639, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 25020 + }, + { + "epoch": 0.25021, + "grad_norm": 1.1496339136709215, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 25021 + }, + { + "epoch": 0.25022, + "grad_norm": 1.2164552872614778, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 25022 + }, + { + "epoch": 0.25023, + "grad_norm": 1.6243003789370114, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 25023 + }, + { + "epoch": 0.25024, + "grad_norm": 1.124096707417229, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 25024 + }, + { + "epoch": 0.25025, + "grad_norm": 1.0419774078006838, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 25025 + }, + { + "epoch": 0.25026, + "grad_norm": 0.9711366351677849, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 25026 + }, + { + "epoch": 0.25027, + "grad_norm": 0.97675025403763, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 25027 + }, + { + "epoch": 0.25028, + "grad_norm": 1.1109070886817412, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 25028 + }, + { + "epoch": 0.25029, + "grad_norm": 1.2172574584394815, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 25029 + }, + { + "epoch": 0.2503, + "grad_norm": 1.247965871887491, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 25030 + }, + { + "epoch": 0.25031, + "grad_norm": 0.7776165108217428, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 25031 + }, + { + "epoch": 0.25032, + "grad_norm": 0.8306721903717568, + "learning_rate": 0.003, + "loss": 4.084, + "step": 25032 + }, + { + "epoch": 0.25033, + "grad_norm": 0.9670834995339764, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 25033 + }, + { + "epoch": 0.25034, + "grad_norm": 1.0347753903428625, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 25034 + }, + { + "epoch": 0.25035, + "grad_norm": 0.9160684364908147, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 25035 + }, + { + "epoch": 0.25036, + "grad_norm": 0.8458236427949395, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 25036 + }, + { + "epoch": 0.25037, + "grad_norm": 1.1818064024537793, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 25037 + }, + { + "epoch": 0.25038, + "grad_norm": 1.1883236506527866, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 25038 + }, + { + "epoch": 0.25039, + "grad_norm": 1.2020523876367668, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 25039 + }, + { + "epoch": 0.2504, + "grad_norm": 0.8837722727470024, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 25040 + }, + { + "epoch": 0.25041, + "grad_norm": 0.8522048980155027, + "learning_rate": 0.003, + "loss": 4.091, + "step": 25041 + }, + { + "epoch": 0.25042, + "grad_norm": 0.9399330735825984, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 25042 + }, + { + "epoch": 0.25043, + "grad_norm": 1.0378991269960471, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 25043 + }, + { + "epoch": 0.25044, + "grad_norm": 0.929956005496625, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 25044 + }, + { + "epoch": 0.25045, + "grad_norm": 1.020457057304478, + "learning_rate": 0.003, + "loss": 4.055, + "step": 25045 + }, + { + "epoch": 0.25046, + "grad_norm": 1.140047710480014, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 25046 + }, + { + "epoch": 0.25047, + "grad_norm": 0.7902108998865288, + "learning_rate": 0.003, + "loss": 4.08, + "step": 25047 + }, + { + "epoch": 0.25048, + "grad_norm": 0.8971655814094962, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 25048 + }, + { + "epoch": 0.25049, + "grad_norm": 0.8922678756905887, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 25049 + }, + { + "epoch": 0.2505, + "grad_norm": 1.1519497631746742, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 25050 + }, + { + "epoch": 0.25051, + "grad_norm": 1.1195376673999218, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 25051 + }, + { + "epoch": 0.25052, + "grad_norm": 0.97888067076068, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 25052 + }, + { + "epoch": 0.25053, + "grad_norm": 0.8241755508775127, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 25053 + }, + { + "epoch": 0.25054, + "grad_norm": 0.6713725274741734, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 25054 + }, + { + "epoch": 0.25055, + "grad_norm": 0.6150041292698852, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 25055 + }, + { + "epoch": 0.25056, + "grad_norm": 0.6315935068627877, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 25056 + }, + { + "epoch": 0.25057, + "grad_norm": 0.7602159538029633, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 25057 + }, + { + "epoch": 0.25058, + "grad_norm": 0.916700254370856, + "learning_rate": 0.003, + "loss": 4.06, + "step": 25058 + }, + { + "epoch": 0.25059, + "grad_norm": 1.058707832673356, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 25059 + }, + { + "epoch": 0.2506, + "grad_norm": 1.0954966816091278, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 25060 + }, + { + "epoch": 0.25061, + "grad_norm": 0.9691939733883124, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 25061 + }, + { + "epoch": 0.25062, + "grad_norm": 0.8092999766323778, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 25062 + }, + { + "epoch": 0.25063, + "grad_norm": 0.8246555639580762, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 25063 + }, + { + "epoch": 0.25064, + "grad_norm": 0.7628830041794898, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 25064 + }, + { + "epoch": 0.25065, + "grad_norm": 0.7272746422352339, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 25065 + }, + { + "epoch": 0.25066, + "grad_norm": 0.6345537117820802, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 25066 + }, + { + "epoch": 0.25067, + "grad_norm": 0.4675959715614357, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 25067 + }, + { + "epoch": 0.25068, + "grad_norm": 0.4863201850031912, + "learning_rate": 0.003, + "loss": 4.0073, + "step": 25068 + }, + { + "epoch": 0.25069, + "grad_norm": 0.4617046091041555, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 25069 + }, + { + "epoch": 0.2507, + "grad_norm": 0.43825143815971773, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 25070 + }, + { + "epoch": 0.25071, + "grad_norm": 0.525905116417564, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 25071 + }, + { + "epoch": 0.25072, + "grad_norm": 0.668319837458617, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 25072 + }, + { + "epoch": 0.25073, + "grad_norm": 0.8422337250179343, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 25073 + }, + { + "epoch": 0.25074, + "grad_norm": 1.1816687225665465, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 25074 + }, + { + "epoch": 0.25075, + "grad_norm": 1.0020566381361342, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 25075 + }, + { + "epoch": 0.25076, + "grad_norm": 0.986729759270076, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 25076 + }, + { + "epoch": 0.25077, + "grad_norm": 1.00469383067617, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 25077 + }, + { + "epoch": 0.25078, + "grad_norm": 0.8837208467119414, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 25078 + }, + { + "epoch": 0.25079, + "grad_norm": 0.6824917933565879, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 25079 + }, + { + "epoch": 0.2508, + "grad_norm": 0.6198671270040899, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 25080 + }, + { + "epoch": 0.25081, + "grad_norm": 0.5443134229543437, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 25081 + }, + { + "epoch": 0.25082, + "grad_norm": 0.6004036647609302, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 25082 + }, + { + "epoch": 0.25083, + "grad_norm": 0.6508500130264369, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 25083 + }, + { + "epoch": 0.25084, + "grad_norm": 0.9005061338615322, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 25084 + }, + { + "epoch": 0.25085, + "grad_norm": 1.2109685128305554, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 25085 + }, + { + "epoch": 0.25086, + "grad_norm": 0.8615367253459664, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 25086 + }, + { + "epoch": 0.25087, + "grad_norm": 0.61490159543475, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 25087 + }, + { + "epoch": 0.25088, + "grad_norm": 0.5645078039744594, + "learning_rate": 0.003, + "loss": 4.062, + "step": 25088 + }, + { + "epoch": 0.25089, + "grad_norm": 0.6806136886781947, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 25089 + }, + { + "epoch": 0.2509, + "grad_norm": 0.8254573353259544, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 25090 + }, + { + "epoch": 0.25091, + "grad_norm": 0.9523313090582275, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 25091 + }, + { + "epoch": 0.25092, + "grad_norm": 1.0607135733791946, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 25092 + }, + { + "epoch": 0.25093, + "grad_norm": 0.9336317380898301, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 25093 + }, + { + "epoch": 0.25094, + "grad_norm": 0.9383989739073828, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 25094 + }, + { + "epoch": 0.25095, + "grad_norm": 0.976214928244242, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 25095 + }, + { + "epoch": 0.25096, + "grad_norm": 1.038526541165959, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 25096 + }, + { + "epoch": 0.25097, + "grad_norm": 1.129597483555832, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 25097 + }, + { + "epoch": 0.25098, + "grad_norm": 1.019362127675871, + "learning_rate": 0.003, + "loss": 4.047, + "step": 25098 + }, + { + "epoch": 0.25099, + "grad_norm": 0.8608281225298912, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 25099 + }, + { + "epoch": 0.251, + "grad_norm": 0.8012608490443469, + "learning_rate": 0.003, + "loss": 4.053, + "step": 25100 + }, + { + "epoch": 0.25101, + "grad_norm": 0.6842043070653451, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 25101 + }, + { + "epoch": 0.25102, + "grad_norm": 0.5740110129881074, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 25102 + }, + { + "epoch": 0.25103, + "grad_norm": 0.5147349471610092, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 25103 + }, + { + "epoch": 0.25104, + "grad_norm": 0.4756094369133143, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 25104 + }, + { + "epoch": 0.25105, + "grad_norm": 0.46513665124776143, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 25105 + }, + { + "epoch": 0.25106, + "grad_norm": 0.4329026291624255, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 25106 + }, + { + "epoch": 0.25107, + "grad_norm": 0.4567673634992878, + "learning_rate": 0.003, + "loss": 4.0027, + "step": 25107 + }, + { + "epoch": 0.25108, + "grad_norm": 0.4936691979667193, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 25108 + }, + { + "epoch": 0.25109, + "grad_norm": 0.5902121576068349, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 25109 + }, + { + "epoch": 0.2511, + "grad_norm": 0.8069945609455702, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 25110 + }, + { + "epoch": 0.25111, + "grad_norm": 1.136310175425401, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 25111 + }, + { + "epoch": 0.25112, + "grad_norm": 1.1967387967538097, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 25112 + }, + { + "epoch": 0.25113, + "grad_norm": 0.7619060803032691, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 25113 + }, + { + "epoch": 0.25114, + "grad_norm": 0.8071813860116033, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 25114 + }, + { + "epoch": 0.25115, + "grad_norm": 0.9339821731483683, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 25115 + }, + { + "epoch": 0.25116, + "grad_norm": 0.8957828945875281, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 25116 + }, + { + "epoch": 0.25117, + "grad_norm": 0.8617330471773642, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 25117 + }, + { + "epoch": 0.25118, + "grad_norm": 0.9180024376747362, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 25118 + }, + { + "epoch": 0.25119, + "grad_norm": 0.8684379685402638, + "learning_rate": 0.003, + "loss": 4.0058, + "step": 25119 + }, + { + "epoch": 0.2512, + "grad_norm": 0.9286283278573202, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 25120 + }, + { + "epoch": 0.25121, + "grad_norm": 0.9401947200061271, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 25121 + }, + { + "epoch": 0.25122, + "grad_norm": 0.8834515685729514, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 25122 + }, + { + "epoch": 0.25123, + "grad_norm": 0.7750276533630232, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 25123 + }, + { + "epoch": 0.25124, + "grad_norm": 0.8631769841365771, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 25124 + }, + { + "epoch": 0.25125, + "grad_norm": 1.1906761217962791, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 25125 + }, + { + "epoch": 0.25126, + "grad_norm": 1.0720401482892024, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 25126 + }, + { + "epoch": 0.25127, + "grad_norm": 0.8767545569957969, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 25127 + }, + { + "epoch": 0.25128, + "grad_norm": 0.7863285916794128, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 25128 + }, + { + "epoch": 0.25129, + "grad_norm": 0.7340058131336088, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 25129 + }, + { + "epoch": 0.2513, + "grad_norm": 0.6875929322762984, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 25130 + }, + { + "epoch": 0.25131, + "grad_norm": 0.599577439650636, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 25131 + }, + { + "epoch": 0.25132, + "grad_norm": 0.690679847366617, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 25132 + }, + { + "epoch": 0.25133, + "grad_norm": 0.6529766175097188, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 25133 + }, + { + "epoch": 0.25134, + "grad_norm": 0.5916883964116354, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 25134 + }, + { + "epoch": 0.25135, + "grad_norm": 0.57167109764271, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 25135 + }, + { + "epoch": 0.25136, + "grad_norm": 0.668114027669172, + "learning_rate": 0.003, + "loss": 3.9963, + "step": 25136 + }, + { + "epoch": 0.25137, + "grad_norm": 0.8738805137224449, + "learning_rate": 0.003, + "loss": 4.047, + "step": 25137 + }, + { + "epoch": 0.25138, + "grad_norm": 1.1286847828099993, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 25138 + }, + { + "epoch": 0.25139, + "grad_norm": 1.0587493220005495, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 25139 + }, + { + "epoch": 0.2514, + "grad_norm": 0.9975101547437617, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 25140 + }, + { + "epoch": 0.25141, + "grad_norm": 0.9880287927954604, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 25141 + }, + { + "epoch": 0.25142, + "grad_norm": 0.9229511689269122, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 25142 + }, + { + "epoch": 0.25143, + "grad_norm": 0.8843976777708843, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 25143 + }, + { + "epoch": 0.25144, + "grad_norm": 0.962221493395311, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 25144 + }, + { + "epoch": 0.25145, + "grad_norm": 0.9698641348802396, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 25145 + }, + { + "epoch": 0.25146, + "grad_norm": 1.0569166111746384, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 25146 + }, + { + "epoch": 0.25147, + "grad_norm": 1.2123511031266783, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 25147 + }, + { + "epoch": 0.25148, + "grad_norm": 0.9203625784837228, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 25148 + }, + { + "epoch": 0.25149, + "grad_norm": 0.8211277267162767, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 25149 + }, + { + "epoch": 0.2515, + "grad_norm": 0.9898899201348818, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 25150 + }, + { + "epoch": 0.25151, + "grad_norm": 1.1705211043169956, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 25151 + }, + { + "epoch": 0.25152, + "grad_norm": 0.9526808882868566, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 25152 + }, + { + "epoch": 0.25153, + "grad_norm": 0.8389608061512351, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 25153 + }, + { + "epoch": 0.25154, + "grad_norm": 0.7533982375758314, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 25154 + }, + { + "epoch": 0.25155, + "grad_norm": 0.8544760858647533, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 25155 + }, + { + "epoch": 0.25156, + "grad_norm": 0.8828070352273748, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 25156 + }, + { + "epoch": 0.25157, + "grad_norm": 0.9604626305653975, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 25157 + }, + { + "epoch": 0.25158, + "grad_norm": 1.048648378825132, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 25158 + }, + { + "epoch": 0.25159, + "grad_norm": 1.0833908511434693, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 25159 + }, + { + "epoch": 0.2516, + "grad_norm": 0.8408758870744419, + "learning_rate": 0.003, + "loss": 4.03, + "step": 25160 + }, + { + "epoch": 0.25161, + "grad_norm": 0.7556336457631874, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 25161 + }, + { + "epoch": 0.25162, + "grad_norm": 0.7293796072226307, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 25162 + }, + { + "epoch": 0.25163, + "grad_norm": 0.709286265984986, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 25163 + }, + { + "epoch": 0.25164, + "grad_norm": 0.6638362614172476, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 25164 + }, + { + "epoch": 0.25165, + "grad_norm": 0.6338064291897981, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 25165 + }, + { + "epoch": 0.25166, + "grad_norm": 0.6816138402393093, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 25166 + }, + { + "epoch": 0.25167, + "grad_norm": 0.795382936425935, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 25167 + }, + { + "epoch": 0.25168, + "grad_norm": 0.9295426899641294, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 25168 + }, + { + "epoch": 0.25169, + "grad_norm": 0.9547852532356386, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 25169 + }, + { + "epoch": 0.2517, + "grad_norm": 0.9005290815211721, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 25170 + }, + { + "epoch": 0.25171, + "grad_norm": 0.8003418463630907, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 25171 + }, + { + "epoch": 0.25172, + "grad_norm": 0.7098177762206411, + "learning_rate": 0.003, + "loss": 4.036, + "step": 25172 + }, + { + "epoch": 0.25173, + "grad_norm": 0.7087210930518817, + "learning_rate": 0.003, + "loss": 3.987, + "step": 25173 + }, + { + "epoch": 0.25174, + "grad_norm": 0.6279601183392163, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 25174 + }, + { + "epoch": 0.25175, + "grad_norm": 0.688130799164182, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 25175 + }, + { + "epoch": 0.25176, + "grad_norm": 0.7765466121227397, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 25176 + }, + { + "epoch": 0.25177, + "grad_norm": 0.940906905031278, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 25177 + }, + { + "epoch": 0.25178, + "grad_norm": 1.075773281279824, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 25178 + }, + { + "epoch": 0.25179, + "grad_norm": 0.9841992918270726, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 25179 + }, + { + "epoch": 0.2518, + "grad_norm": 0.9369489061134327, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 25180 + }, + { + "epoch": 0.25181, + "grad_norm": 0.8248480057966866, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 25181 + }, + { + "epoch": 0.25182, + "grad_norm": 0.9892215434358935, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 25182 + }, + { + "epoch": 0.25183, + "grad_norm": 0.9207774582261831, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 25183 + }, + { + "epoch": 0.25184, + "grad_norm": 0.8886669570125391, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 25184 + }, + { + "epoch": 0.25185, + "grad_norm": 0.7821355678708054, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 25185 + }, + { + "epoch": 0.25186, + "grad_norm": 0.8901906377624766, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 25186 + }, + { + "epoch": 0.25187, + "grad_norm": 0.9900567139605738, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 25187 + }, + { + "epoch": 0.25188, + "grad_norm": 1.1305716629622564, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 25188 + }, + { + "epoch": 0.25189, + "grad_norm": 0.7542767343839198, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 25189 + }, + { + "epoch": 0.2519, + "grad_norm": 0.6027405628789158, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 25190 + }, + { + "epoch": 0.25191, + "grad_norm": 0.7246862571682013, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 25191 + }, + { + "epoch": 0.25192, + "grad_norm": 0.774040854479532, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 25192 + }, + { + "epoch": 0.25193, + "grad_norm": 0.6341528986659734, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 25193 + }, + { + "epoch": 0.25194, + "grad_norm": 0.5694056197889509, + "learning_rate": 0.003, + "loss": 4.01, + "step": 25194 + }, + { + "epoch": 0.25195, + "grad_norm": 0.7739948032594215, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 25195 + }, + { + "epoch": 0.25196, + "grad_norm": 0.9624965736244836, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 25196 + }, + { + "epoch": 0.25197, + "grad_norm": 1.070892106503341, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 25197 + }, + { + "epoch": 0.25198, + "grad_norm": 0.9605315431678978, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 25198 + }, + { + "epoch": 0.25199, + "grad_norm": 0.9788966527264475, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 25199 + }, + { + "epoch": 0.252, + "grad_norm": 0.9424917208332682, + "learning_rate": 0.003, + "loss": 4.062, + "step": 25200 + }, + { + "epoch": 0.25201, + "grad_norm": 1.1661054463651646, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 25201 + }, + { + "epoch": 0.25202, + "grad_norm": 1.0190964944092762, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 25202 + }, + { + "epoch": 0.25203, + "grad_norm": 1.1096687047797764, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 25203 + }, + { + "epoch": 0.25204, + "grad_norm": 1.0279324933045393, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 25204 + }, + { + "epoch": 0.25205, + "grad_norm": 1.1370091227500148, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 25205 + }, + { + "epoch": 0.25206, + "grad_norm": 0.9181255689912888, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 25206 + }, + { + "epoch": 0.25207, + "grad_norm": 0.7704206828748278, + "learning_rate": 0.003, + "loss": 4.042, + "step": 25207 + }, + { + "epoch": 0.25208, + "grad_norm": 0.745834151731976, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 25208 + }, + { + "epoch": 0.25209, + "grad_norm": 0.6856649792357714, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 25209 + }, + { + "epoch": 0.2521, + "grad_norm": 0.7622430125826458, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 25210 + }, + { + "epoch": 0.25211, + "grad_norm": 0.7986538714644699, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 25211 + }, + { + "epoch": 0.25212, + "grad_norm": 0.8734018047260165, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 25212 + }, + { + "epoch": 0.25213, + "grad_norm": 0.8326016255756234, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 25213 + }, + { + "epoch": 0.25214, + "grad_norm": 0.7443003911106852, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 25214 + }, + { + "epoch": 0.25215, + "grad_norm": 0.7526355470218531, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 25215 + }, + { + "epoch": 0.25216, + "grad_norm": 0.9472294575923542, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 25216 + }, + { + "epoch": 0.25217, + "grad_norm": 1.2402170598118385, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 25217 + }, + { + "epoch": 0.25218, + "grad_norm": 0.9745632980354524, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 25218 + }, + { + "epoch": 0.25219, + "grad_norm": 0.973079921672595, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 25219 + }, + { + "epoch": 0.2522, + "grad_norm": 1.0026004606339565, + "learning_rate": 0.003, + "loss": 4.062, + "step": 25220 + }, + { + "epoch": 0.25221, + "grad_norm": 0.9005587341136152, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 25221 + }, + { + "epoch": 0.25222, + "grad_norm": 0.8087794163926014, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 25222 + }, + { + "epoch": 0.25223, + "grad_norm": 0.7121581125382069, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 25223 + }, + { + "epoch": 0.25224, + "grad_norm": 0.7424726687515298, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 25224 + }, + { + "epoch": 0.25225, + "grad_norm": 0.7476766293586138, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 25225 + }, + { + "epoch": 0.25226, + "grad_norm": 0.7346687257354678, + "learning_rate": 0.003, + "loss": 4.083, + "step": 25226 + }, + { + "epoch": 0.25227, + "grad_norm": 0.6869175734922605, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 25227 + }, + { + "epoch": 0.25228, + "grad_norm": 0.7498749359421119, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 25228 + }, + { + "epoch": 0.25229, + "grad_norm": 0.8699068122680356, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 25229 + }, + { + "epoch": 0.2523, + "grad_norm": 0.9198411467379168, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 25230 + }, + { + "epoch": 0.25231, + "grad_norm": 0.9380321912887235, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 25231 + }, + { + "epoch": 0.25232, + "grad_norm": 0.9905981867738339, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 25232 + }, + { + "epoch": 0.25233, + "grad_norm": 0.9430014506728069, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 25233 + }, + { + "epoch": 0.25234, + "grad_norm": 1.0044826588230238, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 25234 + }, + { + "epoch": 0.25235, + "grad_norm": 1.1855414762160477, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 25235 + }, + { + "epoch": 0.25236, + "grad_norm": 0.8237914038283526, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 25236 + }, + { + "epoch": 0.25237, + "grad_norm": 0.7198964430985642, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 25237 + }, + { + "epoch": 0.25238, + "grad_norm": 0.7088122263581368, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 25238 + }, + { + "epoch": 0.25239, + "grad_norm": 0.7691120843572459, + "learning_rate": 0.003, + "loss": 4.035, + "step": 25239 + }, + { + "epoch": 0.2524, + "grad_norm": 0.7349382923158768, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 25240 + }, + { + "epoch": 0.25241, + "grad_norm": 0.7921997771722659, + "learning_rate": 0.003, + "loss": 4.052, + "step": 25241 + }, + { + "epoch": 0.25242, + "grad_norm": 0.9768713384762246, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 25242 + }, + { + "epoch": 0.25243, + "grad_norm": 1.0542123636991634, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 25243 + }, + { + "epoch": 0.25244, + "grad_norm": 0.8276986442114519, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 25244 + }, + { + "epoch": 0.25245, + "grad_norm": 0.7357455725766722, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 25245 + }, + { + "epoch": 0.25246, + "grad_norm": 1.0319044598724785, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 25246 + }, + { + "epoch": 0.25247, + "grad_norm": 1.3120279592577566, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 25247 + }, + { + "epoch": 0.25248, + "grad_norm": 0.8286114938976565, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 25248 + }, + { + "epoch": 0.25249, + "grad_norm": 0.7851173587255428, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 25249 + }, + { + "epoch": 0.2525, + "grad_norm": 0.7484835280417966, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 25250 + }, + { + "epoch": 0.25251, + "grad_norm": 0.7262841719914204, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 25251 + }, + { + "epoch": 0.25252, + "grad_norm": 0.7762935232725945, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 25252 + }, + { + "epoch": 0.25253, + "grad_norm": 0.7348960292908512, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 25253 + }, + { + "epoch": 0.25254, + "grad_norm": 0.7490491515241705, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 25254 + }, + { + "epoch": 0.25255, + "grad_norm": 0.8354672467712169, + "learning_rate": 0.003, + "loss": 4.057, + "step": 25255 + }, + { + "epoch": 0.25256, + "grad_norm": 0.926177167624343, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 25256 + }, + { + "epoch": 0.25257, + "grad_norm": 1.0427794923489049, + "learning_rate": 0.003, + "loss": 4.056, + "step": 25257 + }, + { + "epoch": 0.25258, + "grad_norm": 1.0955999190626222, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 25258 + }, + { + "epoch": 0.25259, + "grad_norm": 1.056473983272084, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 25259 + }, + { + "epoch": 0.2526, + "grad_norm": 0.9659000814153147, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 25260 + }, + { + "epoch": 0.25261, + "grad_norm": 0.9590073496711436, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 25261 + }, + { + "epoch": 0.25262, + "grad_norm": 1.1558839791320852, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 25262 + }, + { + "epoch": 0.25263, + "grad_norm": 0.8344369479069749, + "learning_rate": 0.003, + "loss": 4.036, + "step": 25263 + }, + { + "epoch": 0.25264, + "grad_norm": 0.8111044497591182, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 25264 + }, + { + "epoch": 0.25265, + "grad_norm": 0.7271217320257766, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 25265 + }, + { + "epoch": 0.25266, + "grad_norm": 0.6665717487847707, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 25266 + }, + { + "epoch": 0.25267, + "grad_norm": 0.5663656116799429, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 25267 + }, + { + "epoch": 0.25268, + "grad_norm": 0.5214214111924499, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 25268 + }, + { + "epoch": 0.25269, + "grad_norm": 0.6292278456194152, + "learning_rate": 0.003, + "loss": 4.002, + "step": 25269 + }, + { + "epoch": 0.2527, + "grad_norm": 0.7029327778166253, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 25270 + }, + { + "epoch": 0.25271, + "grad_norm": 0.8759019153673309, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 25271 + }, + { + "epoch": 0.25272, + "grad_norm": 0.8955640173246128, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 25272 + }, + { + "epoch": 0.25273, + "grad_norm": 0.7466769901288722, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 25273 + }, + { + "epoch": 0.25274, + "grad_norm": 0.6332287408372114, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 25274 + }, + { + "epoch": 0.25275, + "grad_norm": 0.6729359609891398, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 25275 + }, + { + "epoch": 0.25276, + "grad_norm": 0.736250896404633, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 25276 + }, + { + "epoch": 0.25277, + "grad_norm": 0.7645156763425622, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 25277 + }, + { + "epoch": 0.25278, + "grad_norm": 0.9239753453357612, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 25278 + }, + { + "epoch": 0.25279, + "grad_norm": 1.0369218580532171, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 25279 + }, + { + "epoch": 0.2528, + "grad_norm": 0.960665030868886, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 25280 + }, + { + "epoch": 0.25281, + "grad_norm": 1.1587882851744427, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 25281 + }, + { + "epoch": 0.25282, + "grad_norm": 0.9250410365536718, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 25282 + }, + { + "epoch": 0.25283, + "grad_norm": 0.9849085885811593, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 25283 + }, + { + "epoch": 0.25284, + "grad_norm": 1.0664066574748838, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 25284 + }, + { + "epoch": 0.25285, + "grad_norm": 0.8908381876198105, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 25285 + }, + { + "epoch": 0.25286, + "grad_norm": 1.000445410786719, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 25286 + }, + { + "epoch": 0.25287, + "grad_norm": 1.0783585706918735, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 25287 + }, + { + "epoch": 0.25288, + "grad_norm": 0.9430737984026302, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 25288 + }, + { + "epoch": 0.25289, + "grad_norm": 0.9242728937612926, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 25289 + }, + { + "epoch": 0.2529, + "grad_norm": 0.9618947346540768, + "learning_rate": 0.003, + "loss": 4.041, + "step": 25290 + }, + { + "epoch": 0.25291, + "grad_norm": 0.9284490082835613, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 25291 + }, + { + "epoch": 0.25292, + "grad_norm": 0.8769662225821231, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 25292 + }, + { + "epoch": 0.25293, + "grad_norm": 0.9586373807822492, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 25293 + }, + { + "epoch": 0.25294, + "grad_norm": 1.069876489815011, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 25294 + }, + { + "epoch": 0.25295, + "grad_norm": 0.9192923928513109, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 25295 + }, + { + "epoch": 0.25296, + "grad_norm": 1.0978734419387173, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 25296 + }, + { + "epoch": 0.25297, + "grad_norm": 0.976272785047859, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 25297 + }, + { + "epoch": 0.25298, + "grad_norm": 0.8816945945241275, + "learning_rate": 0.003, + "loss": 4.066, + "step": 25298 + }, + { + "epoch": 0.25299, + "grad_norm": 0.7207166363875429, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 25299 + }, + { + "epoch": 0.253, + "grad_norm": 0.7201205185320428, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 25300 + }, + { + "epoch": 0.25301, + "grad_norm": 0.8543453280327397, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 25301 + }, + { + "epoch": 0.25302, + "grad_norm": 1.0716620064262306, + "learning_rate": 0.003, + "loss": 4.046, + "step": 25302 + }, + { + "epoch": 0.25303, + "grad_norm": 1.0881797746825694, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 25303 + }, + { + "epoch": 0.25304, + "grad_norm": 1.016852006375203, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 25304 + }, + { + "epoch": 0.25305, + "grad_norm": 0.9554927945062803, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 25305 + }, + { + "epoch": 0.25306, + "grad_norm": 0.9277551869535792, + "learning_rate": 0.003, + "loss": 4.049, + "step": 25306 + }, + { + "epoch": 0.25307, + "grad_norm": 0.9986351820325645, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 25307 + }, + { + "epoch": 0.25308, + "grad_norm": 1.0005087099042784, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 25308 + }, + { + "epoch": 0.25309, + "grad_norm": 0.9566213582654867, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 25309 + }, + { + "epoch": 0.2531, + "grad_norm": 0.9462389729872062, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 25310 + }, + { + "epoch": 0.25311, + "grad_norm": 0.9786778133935703, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 25311 + }, + { + "epoch": 0.25312, + "grad_norm": 0.9349738469532464, + "learning_rate": 0.003, + "loss": 4.074, + "step": 25312 + }, + { + "epoch": 0.25313, + "grad_norm": 0.9532965691201983, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 25313 + }, + { + "epoch": 0.25314, + "grad_norm": 0.9982918258218613, + "learning_rate": 0.003, + "loss": 4.03, + "step": 25314 + }, + { + "epoch": 0.25315, + "grad_norm": 0.8885490950199534, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 25315 + }, + { + "epoch": 0.25316, + "grad_norm": 0.7721583075658163, + "learning_rate": 0.003, + "loss": 4.023, + "step": 25316 + }, + { + "epoch": 0.25317, + "grad_norm": 0.7743708964220125, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 25317 + }, + { + "epoch": 0.25318, + "grad_norm": 0.7591344200289556, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 25318 + }, + { + "epoch": 0.25319, + "grad_norm": 0.727765352689977, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 25319 + }, + { + "epoch": 0.2532, + "grad_norm": 0.7709962453743868, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 25320 + }, + { + "epoch": 0.25321, + "grad_norm": 0.705722334589492, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 25321 + }, + { + "epoch": 0.25322, + "grad_norm": 0.7525885466576987, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 25322 + }, + { + "epoch": 0.25323, + "grad_norm": 0.8126192126234123, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 25323 + }, + { + "epoch": 0.25324, + "grad_norm": 0.8560447124533925, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 25324 + }, + { + "epoch": 0.25325, + "grad_norm": 0.9334315537878937, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 25325 + }, + { + "epoch": 0.25326, + "grad_norm": 1.0088873037776613, + "learning_rate": 0.003, + "loss": 4.056, + "step": 25326 + }, + { + "epoch": 0.25327, + "grad_norm": 1.1520395561855754, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 25327 + }, + { + "epoch": 0.25328, + "grad_norm": 0.8636976251836621, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 25328 + }, + { + "epoch": 0.25329, + "grad_norm": 0.8233150442572819, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 25329 + }, + { + "epoch": 0.2533, + "grad_norm": 0.740016900916187, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 25330 + }, + { + "epoch": 0.25331, + "grad_norm": 0.6811805208570688, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 25331 + }, + { + "epoch": 0.25332, + "grad_norm": 0.662662156005869, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 25332 + }, + { + "epoch": 0.25333, + "grad_norm": 0.7645817960044332, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 25333 + }, + { + "epoch": 0.25334, + "grad_norm": 1.0647025891409587, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 25334 + }, + { + "epoch": 0.25335, + "grad_norm": 1.1499368124227125, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 25335 + }, + { + "epoch": 0.25336, + "grad_norm": 0.9691997843489087, + "learning_rate": 0.003, + "loss": 4.074, + "step": 25336 + }, + { + "epoch": 0.25337, + "grad_norm": 0.9956269773790227, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 25337 + }, + { + "epoch": 0.25338, + "grad_norm": 1.0395578016260243, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 25338 + }, + { + "epoch": 0.25339, + "grad_norm": 1.1080047341468648, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 25339 + }, + { + "epoch": 0.2534, + "grad_norm": 1.0891566670494628, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 25340 + }, + { + "epoch": 0.25341, + "grad_norm": 0.9992001174644397, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 25341 + }, + { + "epoch": 0.25342, + "grad_norm": 0.9654661123101281, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 25342 + }, + { + "epoch": 0.25343, + "grad_norm": 0.9812674394225781, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 25343 + }, + { + "epoch": 0.25344, + "grad_norm": 0.9329278657193633, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 25344 + }, + { + "epoch": 0.25345, + "grad_norm": 0.9958000555101592, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 25345 + }, + { + "epoch": 0.25346, + "grad_norm": 1.0684253808163764, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 25346 + }, + { + "epoch": 0.25347, + "grad_norm": 0.9881712765715379, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 25347 + }, + { + "epoch": 0.25348, + "grad_norm": 0.9211958370620181, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 25348 + }, + { + "epoch": 0.25349, + "grad_norm": 0.800051863518404, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 25349 + }, + { + "epoch": 0.2535, + "grad_norm": 0.6863668037561003, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 25350 + }, + { + "epoch": 0.25351, + "grad_norm": 0.6540307341820532, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 25351 + }, + { + "epoch": 0.25352, + "grad_norm": 0.6795148378455266, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 25352 + }, + { + "epoch": 0.25353, + "grad_norm": 0.8136563327333226, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 25353 + }, + { + "epoch": 0.25354, + "grad_norm": 0.997991280001786, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 25354 + }, + { + "epoch": 0.25355, + "grad_norm": 1.1504047054129567, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 25355 + }, + { + "epoch": 0.25356, + "grad_norm": 0.8354597398779309, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 25356 + }, + { + "epoch": 0.25357, + "grad_norm": 0.7497340470905738, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 25357 + }, + { + "epoch": 0.25358, + "grad_norm": 0.7779159811438947, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 25358 + }, + { + "epoch": 0.25359, + "grad_norm": 0.7357366437028393, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 25359 + }, + { + "epoch": 0.2536, + "grad_norm": 0.8257347824140242, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 25360 + }, + { + "epoch": 0.25361, + "grad_norm": 0.8791147133718787, + "learning_rate": 0.003, + "loss": 4.075, + "step": 25361 + }, + { + "epoch": 0.25362, + "grad_norm": 0.9820244568066587, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 25362 + }, + { + "epoch": 0.25363, + "grad_norm": 1.008120974601097, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 25363 + }, + { + "epoch": 0.25364, + "grad_norm": 0.9968645577938047, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 25364 + }, + { + "epoch": 0.25365, + "grad_norm": 1.3774432360776234, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 25365 + }, + { + "epoch": 0.25366, + "grad_norm": 0.7743160983542406, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 25366 + }, + { + "epoch": 0.25367, + "grad_norm": 0.6521957190836891, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 25367 + }, + { + "epoch": 0.25368, + "grad_norm": 0.6310637605850519, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 25368 + }, + { + "epoch": 0.25369, + "grad_norm": 0.6617832700730458, + "learning_rate": 0.003, + "loss": 4.043, + "step": 25369 + }, + { + "epoch": 0.2537, + "grad_norm": 0.7164069115245838, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 25370 + }, + { + "epoch": 0.25371, + "grad_norm": 0.7580742664238952, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 25371 + }, + { + "epoch": 0.25372, + "grad_norm": 0.7807874231529279, + "learning_rate": 0.003, + "loss": 4.033, + "step": 25372 + }, + { + "epoch": 0.25373, + "grad_norm": 0.6564404852899143, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 25373 + }, + { + "epoch": 0.25374, + "grad_norm": 0.5935282523274951, + "learning_rate": 0.003, + "loss": 4.028, + "step": 25374 + }, + { + "epoch": 0.25375, + "grad_norm": 0.5296634657853462, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 25375 + }, + { + "epoch": 0.25376, + "grad_norm": 0.4181778099612975, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 25376 + }, + { + "epoch": 0.25377, + "grad_norm": 0.47798468276113126, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 25377 + }, + { + "epoch": 0.25378, + "grad_norm": 0.596500985522097, + "learning_rate": 0.003, + "loss": 3.9937, + "step": 25378 + }, + { + "epoch": 0.25379, + "grad_norm": 0.7719746235945136, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 25379 + }, + { + "epoch": 0.2538, + "grad_norm": 0.9590825149948662, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 25380 + }, + { + "epoch": 0.25381, + "grad_norm": 1.1441179454035353, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 25381 + }, + { + "epoch": 0.25382, + "grad_norm": 0.851145891503752, + "learning_rate": 0.003, + "loss": 4.059, + "step": 25382 + }, + { + "epoch": 0.25383, + "grad_norm": 0.9736854894282182, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 25383 + }, + { + "epoch": 0.25384, + "grad_norm": 1.08488241572489, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 25384 + }, + { + "epoch": 0.25385, + "grad_norm": 1.08421579681445, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 25385 + }, + { + "epoch": 0.25386, + "grad_norm": 0.960931951607441, + "learning_rate": 0.003, + "loss": 4.032, + "step": 25386 + }, + { + "epoch": 0.25387, + "grad_norm": 0.8716795048806129, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 25387 + }, + { + "epoch": 0.25388, + "grad_norm": 0.743683455320296, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 25388 + }, + { + "epoch": 0.25389, + "grad_norm": 0.7409421438781537, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 25389 + }, + { + "epoch": 0.2539, + "grad_norm": 0.9583556615793433, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 25390 + }, + { + "epoch": 0.25391, + "grad_norm": 1.2822763993639898, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 25391 + }, + { + "epoch": 0.25392, + "grad_norm": 0.7852785808092915, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 25392 + }, + { + "epoch": 0.25393, + "grad_norm": 0.8054222509597576, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 25393 + }, + { + "epoch": 0.25394, + "grad_norm": 0.8448170134102573, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 25394 + }, + { + "epoch": 0.25395, + "grad_norm": 0.9647944876800983, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 25395 + }, + { + "epoch": 0.25396, + "grad_norm": 1.008748791335382, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 25396 + }, + { + "epoch": 0.25397, + "grad_norm": 1.0277717931923147, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 25397 + }, + { + "epoch": 0.25398, + "grad_norm": 0.9367218610733006, + "learning_rate": 0.003, + "loss": 4.024, + "step": 25398 + }, + { + "epoch": 0.25399, + "grad_norm": 0.8636153444094693, + "learning_rate": 0.003, + "loss": 4.038, + "step": 25399 + }, + { + "epoch": 0.254, + "grad_norm": 0.8795463288207355, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 25400 + }, + { + "epoch": 0.25401, + "grad_norm": 1.038900095744999, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 25401 + }, + { + "epoch": 0.25402, + "grad_norm": 1.153109891819328, + "learning_rate": 0.003, + "loss": 4.04, + "step": 25402 + }, + { + "epoch": 0.25403, + "grad_norm": 0.7834441092563924, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 25403 + }, + { + "epoch": 0.25404, + "grad_norm": 0.5804236645021761, + "learning_rate": 0.003, + "loss": 4.054, + "step": 25404 + }, + { + "epoch": 0.25405, + "grad_norm": 0.6443431267252544, + "learning_rate": 0.003, + "loss": 4.064, + "step": 25405 + }, + { + "epoch": 0.25406, + "grad_norm": 0.6917601310589838, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 25406 + }, + { + "epoch": 0.25407, + "grad_norm": 0.7705419030514664, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 25407 + }, + { + "epoch": 0.25408, + "grad_norm": 0.844362485479889, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 25408 + }, + { + "epoch": 0.25409, + "grad_norm": 0.8736617083147991, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 25409 + }, + { + "epoch": 0.2541, + "grad_norm": 0.9456108059698425, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 25410 + }, + { + "epoch": 0.25411, + "grad_norm": 1.0031540315671683, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 25411 + }, + { + "epoch": 0.25412, + "grad_norm": 0.9115030835895792, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 25412 + }, + { + "epoch": 0.25413, + "grad_norm": 0.7643016281522143, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 25413 + }, + { + "epoch": 0.25414, + "grad_norm": 0.7753846567339371, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 25414 + }, + { + "epoch": 0.25415, + "grad_norm": 0.816961213431605, + "learning_rate": 0.003, + "loss": 4.03, + "step": 25415 + }, + { + "epoch": 0.25416, + "grad_norm": 0.7984824395942095, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 25416 + }, + { + "epoch": 0.25417, + "grad_norm": 0.9236465661385209, + "learning_rate": 0.003, + "loss": 4.046, + "step": 25417 + }, + { + "epoch": 0.25418, + "grad_norm": 0.9805961052156241, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 25418 + }, + { + "epoch": 0.25419, + "grad_norm": 1.0609700217988367, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 25419 + }, + { + "epoch": 0.2542, + "grad_norm": 1.014883826356174, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 25420 + }, + { + "epoch": 0.25421, + "grad_norm": 0.9724732555754642, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 25421 + }, + { + "epoch": 0.25422, + "grad_norm": 1.0059934252737834, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 25422 + }, + { + "epoch": 0.25423, + "grad_norm": 0.998373456982844, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 25423 + }, + { + "epoch": 0.25424, + "grad_norm": 1.0188897861571056, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 25424 + }, + { + "epoch": 0.25425, + "grad_norm": 1.0102722734940304, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 25425 + }, + { + "epoch": 0.25426, + "grad_norm": 0.9847216496965288, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 25426 + }, + { + "epoch": 0.25427, + "grad_norm": 0.8519119893093335, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 25427 + }, + { + "epoch": 0.25428, + "grad_norm": 0.6378448683837147, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 25428 + }, + { + "epoch": 0.25429, + "grad_norm": 0.5954905714975156, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 25429 + }, + { + "epoch": 0.2543, + "grad_norm": 0.6600043923253341, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 25430 + }, + { + "epoch": 0.25431, + "grad_norm": 0.7272253830209318, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 25431 + }, + { + "epoch": 0.25432, + "grad_norm": 0.7771316581669774, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 25432 + }, + { + "epoch": 0.25433, + "grad_norm": 0.8608374103288706, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 25433 + }, + { + "epoch": 0.25434, + "grad_norm": 0.9706761600012382, + "learning_rate": 0.003, + "loss": 4.065, + "step": 25434 + }, + { + "epoch": 0.25435, + "grad_norm": 1.3222323352698893, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 25435 + }, + { + "epoch": 0.25436, + "grad_norm": 0.795646782107252, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 25436 + }, + { + "epoch": 0.25437, + "grad_norm": 0.7422078669450148, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 25437 + }, + { + "epoch": 0.25438, + "grad_norm": 0.7205753569530813, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 25438 + }, + { + "epoch": 0.25439, + "grad_norm": 0.8374703056208461, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 25439 + }, + { + "epoch": 0.2544, + "grad_norm": 0.9473015924535735, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 25440 + }, + { + "epoch": 0.25441, + "grad_norm": 0.9610956779741119, + "learning_rate": 0.003, + "loss": 4.026, + "step": 25441 + }, + { + "epoch": 0.25442, + "grad_norm": 1.0998434799535048, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 25442 + }, + { + "epoch": 0.25443, + "grad_norm": 0.9517483964201439, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 25443 + }, + { + "epoch": 0.25444, + "grad_norm": 1.0881923435275649, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 25444 + }, + { + "epoch": 0.25445, + "grad_norm": 0.9854540428577792, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 25445 + }, + { + "epoch": 0.25446, + "grad_norm": 0.9937334275077021, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 25446 + }, + { + "epoch": 0.25447, + "grad_norm": 0.8991009518576873, + "learning_rate": 0.003, + "loss": 4.088, + "step": 25447 + }, + { + "epoch": 0.25448, + "grad_norm": 0.7835774811713323, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 25448 + }, + { + "epoch": 0.25449, + "grad_norm": 0.7177785680100887, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 25449 + }, + { + "epoch": 0.2545, + "grad_norm": 0.7240991220140802, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 25450 + }, + { + "epoch": 0.25451, + "grad_norm": 0.8143129993012853, + "learning_rate": 0.003, + "loss": 4.041, + "step": 25451 + }, + { + "epoch": 0.25452, + "grad_norm": 0.932923088842068, + "learning_rate": 0.003, + "loss": 4.028, + "step": 25452 + }, + { + "epoch": 0.25453, + "grad_norm": 1.0511225170303924, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 25453 + }, + { + "epoch": 0.25454, + "grad_norm": 0.9486055805646416, + "learning_rate": 0.003, + "loss": 4.06, + "step": 25454 + }, + { + "epoch": 0.25455, + "grad_norm": 0.8992113606784659, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 25455 + }, + { + "epoch": 0.25456, + "grad_norm": 0.8141542958239314, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 25456 + }, + { + "epoch": 0.25457, + "grad_norm": 0.8003685390606392, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 25457 + }, + { + "epoch": 0.25458, + "grad_norm": 0.7702494461996949, + "learning_rate": 0.003, + "loss": 4.056, + "step": 25458 + }, + { + "epoch": 0.25459, + "grad_norm": 0.7926421691522504, + "learning_rate": 0.003, + "loss": 4.03, + "step": 25459 + }, + { + "epoch": 0.2546, + "grad_norm": 0.9799385857050648, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 25460 + }, + { + "epoch": 0.25461, + "grad_norm": 0.9615310596977051, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 25461 + }, + { + "epoch": 0.25462, + "grad_norm": 0.9028601664226293, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 25462 + }, + { + "epoch": 0.25463, + "grad_norm": 0.9020841310022902, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 25463 + }, + { + "epoch": 0.25464, + "grad_norm": 0.861457753056404, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 25464 + }, + { + "epoch": 0.25465, + "grad_norm": 0.8009973446596149, + "learning_rate": 0.003, + "loss": 4.047, + "step": 25465 + }, + { + "epoch": 0.25466, + "grad_norm": 0.858171763018721, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 25466 + }, + { + "epoch": 0.25467, + "grad_norm": 0.9410375851232814, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 25467 + }, + { + "epoch": 0.25468, + "grad_norm": 0.8383089541302179, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 25468 + }, + { + "epoch": 0.25469, + "grad_norm": 0.8337644242039746, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 25469 + }, + { + "epoch": 0.2547, + "grad_norm": 0.8698234142245929, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 25470 + }, + { + "epoch": 0.25471, + "grad_norm": 0.9416410545568833, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 25471 + }, + { + "epoch": 0.25472, + "grad_norm": 0.9582760517410952, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 25472 + }, + { + "epoch": 0.25473, + "grad_norm": 0.9543998899328856, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 25473 + }, + { + "epoch": 0.25474, + "grad_norm": 0.890724598577648, + "learning_rate": 0.003, + "loss": 4.043, + "step": 25474 + }, + { + "epoch": 0.25475, + "grad_norm": 0.9239564384184911, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 25475 + }, + { + "epoch": 0.25476, + "grad_norm": 0.9457922367645536, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 25476 + }, + { + "epoch": 0.25477, + "grad_norm": 0.8573318977003976, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 25477 + }, + { + "epoch": 0.25478, + "grad_norm": 0.895295965815012, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 25478 + }, + { + "epoch": 0.25479, + "grad_norm": 0.9168892873538358, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 25479 + }, + { + "epoch": 0.2548, + "grad_norm": 0.8741428763811664, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 25480 + }, + { + "epoch": 0.25481, + "grad_norm": 0.9105460421050953, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 25481 + }, + { + "epoch": 0.25482, + "grad_norm": 0.9370675321003953, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 25482 + }, + { + "epoch": 0.25483, + "grad_norm": 1.0718822402509025, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 25483 + }, + { + "epoch": 0.25484, + "grad_norm": 1.0001632500340663, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 25484 + }, + { + "epoch": 0.25485, + "grad_norm": 1.1392574341173547, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 25485 + }, + { + "epoch": 0.25486, + "grad_norm": 1.013848565441649, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 25486 + }, + { + "epoch": 0.25487, + "grad_norm": 0.9946501617885597, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 25487 + }, + { + "epoch": 0.25488, + "grad_norm": 0.8692316044232674, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 25488 + }, + { + "epoch": 0.25489, + "grad_norm": 0.8405934452494283, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 25489 + }, + { + "epoch": 0.2549, + "grad_norm": 0.8184790010403689, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 25490 + }, + { + "epoch": 0.25491, + "grad_norm": 0.7197990368625105, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 25491 + }, + { + "epoch": 0.25492, + "grad_norm": 0.782838330427358, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 25492 + }, + { + "epoch": 0.25493, + "grad_norm": 0.7672056907291688, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 25493 + }, + { + "epoch": 0.25494, + "grad_norm": 0.7505411870142598, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 25494 + }, + { + "epoch": 0.25495, + "grad_norm": 0.8081563798574534, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 25495 + }, + { + "epoch": 0.25496, + "grad_norm": 1.0025012764633239, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 25496 + }, + { + "epoch": 0.25497, + "grad_norm": 1.1301600497198363, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 25497 + }, + { + "epoch": 0.25498, + "grad_norm": 0.7981922005489023, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 25498 + }, + { + "epoch": 0.25499, + "grad_norm": 0.9103924489681806, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 25499 + }, + { + "epoch": 0.255, + "grad_norm": 0.9494091859031275, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 25500 + }, + { + "epoch": 0.25501, + "grad_norm": 1.0208856096818113, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 25501 + }, + { + "epoch": 0.25502, + "grad_norm": 1.3111924181603687, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 25502 + }, + { + "epoch": 0.25503, + "grad_norm": 0.7788057062135986, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 25503 + }, + { + "epoch": 0.25504, + "grad_norm": 0.7520958041327547, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 25504 + }, + { + "epoch": 0.25505, + "grad_norm": 0.7187610784459232, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 25505 + }, + { + "epoch": 0.25506, + "grad_norm": 0.5748661321152742, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 25506 + }, + { + "epoch": 0.25507, + "grad_norm": 0.5997867976081815, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 25507 + }, + { + "epoch": 0.25508, + "grad_norm": 0.5955428365180547, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 25508 + }, + { + "epoch": 0.25509, + "grad_norm": 0.8084719075610944, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 25509 + }, + { + "epoch": 0.2551, + "grad_norm": 1.0142895679559947, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 25510 + }, + { + "epoch": 0.25511, + "grad_norm": 1.2282173894268338, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 25511 + }, + { + "epoch": 0.25512, + "grad_norm": 0.683460215566792, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 25512 + }, + { + "epoch": 0.25513, + "grad_norm": 0.6797596531837293, + "learning_rate": 0.003, + "loss": 4.04, + "step": 25513 + }, + { + "epoch": 0.25514, + "grad_norm": 0.6641751647633197, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 25514 + }, + { + "epoch": 0.25515, + "grad_norm": 0.657087574467802, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 25515 + }, + { + "epoch": 0.25516, + "grad_norm": 0.6702335307387821, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 25516 + }, + { + "epoch": 0.25517, + "grad_norm": 0.6197197046005322, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 25517 + }, + { + "epoch": 0.25518, + "grad_norm": 0.7558327995708012, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 25518 + }, + { + "epoch": 0.25519, + "grad_norm": 0.8089473984054202, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 25519 + }, + { + "epoch": 0.2552, + "grad_norm": 0.7265745355368336, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 25520 + }, + { + "epoch": 0.25521, + "grad_norm": 0.5823227255198964, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 25521 + }, + { + "epoch": 0.25522, + "grad_norm": 0.6230522270827914, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 25522 + }, + { + "epoch": 0.25523, + "grad_norm": 0.5992914012060027, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 25523 + }, + { + "epoch": 0.25524, + "grad_norm": 0.742508757382326, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 25524 + }, + { + "epoch": 0.25525, + "grad_norm": 1.0403867381216383, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 25525 + }, + { + "epoch": 0.25526, + "grad_norm": 1.4542059590245984, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 25526 + }, + { + "epoch": 0.25527, + "grad_norm": 0.7504983488349494, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 25527 + }, + { + "epoch": 0.25528, + "grad_norm": 0.7723766434979357, + "learning_rate": 0.003, + "loss": 4.024, + "step": 25528 + }, + { + "epoch": 0.25529, + "grad_norm": 0.9118160035062748, + "learning_rate": 0.003, + "loss": 4.038, + "step": 25529 + }, + { + "epoch": 0.2553, + "grad_norm": 0.9095808486226669, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 25530 + }, + { + "epoch": 0.25531, + "grad_norm": 0.8449916001815673, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 25531 + }, + { + "epoch": 0.25532, + "grad_norm": 0.8702440244971412, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 25532 + }, + { + "epoch": 0.25533, + "grad_norm": 0.9794938453352677, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 25533 + }, + { + "epoch": 0.25534, + "grad_norm": 1.2167464027111463, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 25534 + }, + { + "epoch": 0.25535, + "grad_norm": 0.8682439736283258, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 25535 + }, + { + "epoch": 0.25536, + "grad_norm": 0.7341110227597509, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 25536 + }, + { + "epoch": 0.25537, + "grad_norm": 0.780687291149449, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 25537 + }, + { + "epoch": 0.25538, + "grad_norm": 0.8103130226149866, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 25538 + }, + { + "epoch": 0.25539, + "grad_norm": 0.801259339981181, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 25539 + }, + { + "epoch": 0.2554, + "grad_norm": 0.8013416542998175, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 25540 + }, + { + "epoch": 0.25541, + "grad_norm": 1.0429397349442209, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 25541 + }, + { + "epoch": 0.25542, + "grad_norm": 1.1249394254592362, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 25542 + }, + { + "epoch": 0.25543, + "grad_norm": 1.0430346449018129, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 25543 + }, + { + "epoch": 0.25544, + "grad_norm": 1.0034310086022964, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 25544 + }, + { + "epoch": 0.25545, + "grad_norm": 0.9712440289340865, + "learning_rate": 0.003, + "loss": 4.052, + "step": 25545 + }, + { + "epoch": 0.25546, + "grad_norm": 1.213194252001686, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 25546 + }, + { + "epoch": 0.25547, + "grad_norm": 0.9168267254222705, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 25547 + }, + { + "epoch": 0.25548, + "grad_norm": 0.9423992685584162, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 25548 + }, + { + "epoch": 0.25549, + "grad_norm": 1.0913704277648193, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 25549 + }, + { + "epoch": 0.2555, + "grad_norm": 0.8593106401338043, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 25550 + }, + { + "epoch": 0.25551, + "grad_norm": 0.8969179599582954, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 25551 + }, + { + "epoch": 0.25552, + "grad_norm": 0.9422136983288252, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 25552 + }, + { + "epoch": 0.25553, + "grad_norm": 1.2035228954571051, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 25553 + }, + { + "epoch": 0.25554, + "grad_norm": 0.8811183019321478, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 25554 + }, + { + "epoch": 0.25555, + "grad_norm": 0.8789305952076087, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 25555 + }, + { + "epoch": 0.25556, + "grad_norm": 0.8863340009715519, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 25556 + }, + { + "epoch": 0.25557, + "grad_norm": 0.9700979930725055, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 25557 + }, + { + "epoch": 0.25558, + "grad_norm": 1.0179645129320452, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 25558 + }, + { + "epoch": 0.25559, + "grad_norm": 1.0769272307893212, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 25559 + }, + { + "epoch": 0.2556, + "grad_norm": 0.9473458840685861, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 25560 + }, + { + "epoch": 0.25561, + "grad_norm": 0.9506894946488, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 25561 + }, + { + "epoch": 0.25562, + "grad_norm": 1.008624553357757, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 25562 + }, + { + "epoch": 0.25563, + "grad_norm": 1.015411260879144, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 25563 + }, + { + "epoch": 0.25564, + "grad_norm": 0.9091591793628141, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 25564 + }, + { + "epoch": 0.25565, + "grad_norm": 0.7957076530872577, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 25565 + }, + { + "epoch": 0.25566, + "grad_norm": 0.8147177574731048, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 25566 + }, + { + "epoch": 0.25567, + "grad_norm": 0.890155925257158, + "learning_rate": 0.003, + "loss": 4.077, + "step": 25567 + }, + { + "epoch": 0.25568, + "grad_norm": 0.9524484198845756, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 25568 + }, + { + "epoch": 0.25569, + "grad_norm": 0.9976570676585215, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 25569 + }, + { + "epoch": 0.2557, + "grad_norm": 0.9443232980798448, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 25570 + }, + { + "epoch": 0.25571, + "grad_norm": 0.8625075738759506, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 25571 + }, + { + "epoch": 0.25572, + "grad_norm": 0.7747316821490035, + "learning_rate": 0.003, + "loss": 4.038, + "step": 25572 + }, + { + "epoch": 0.25573, + "grad_norm": 0.6705995262255924, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 25573 + }, + { + "epoch": 0.25574, + "grad_norm": 0.7759838653009686, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 25574 + }, + { + "epoch": 0.25575, + "grad_norm": 0.6786614748639236, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 25575 + }, + { + "epoch": 0.25576, + "grad_norm": 0.6569226479781927, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 25576 + }, + { + "epoch": 0.25577, + "grad_norm": 0.6123073315146863, + "learning_rate": 0.003, + "loss": 3.9865, + "step": 25577 + }, + { + "epoch": 0.25578, + "grad_norm": 0.555942781431035, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 25578 + }, + { + "epoch": 0.25579, + "grad_norm": 0.508188277926097, + "learning_rate": 0.003, + "loss": 4.058, + "step": 25579 + }, + { + "epoch": 0.2558, + "grad_norm": 0.48736665887242026, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 25580 + }, + { + "epoch": 0.25581, + "grad_norm": 0.5591794872385124, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 25581 + }, + { + "epoch": 0.25582, + "grad_norm": 0.6799994399804233, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 25582 + }, + { + "epoch": 0.25583, + "grad_norm": 0.7925941771204883, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 25583 + }, + { + "epoch": 0.25584, + "grad_norm": 1.0116346878747515, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 25584 + }, + { + "epoch": 0.25585, + "grad_norm": 1.2898610556182433, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 25585 + }, + { + "epoch": 0.25586, + "grad_norm": 0.6588163015969251, + "learning_rate": 0.003, + "loss": 4.0077, + "step": 25586 + }, + { + "epoch": 0.25587, + "grad_norm": 0.6672315270036102, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 25587 + }, + { + "epoch": 0.25588, + "grad_norm": 0.700809253249972, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 25588 + }, + { + "epoch": 0.25589, + "grad_norm": 0.6146810028191083, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 25589 + }, + { + "epoch": 0.2559, + "grad_norm": 0.6556997210054634, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 25590 + }, + { + "epoch": 0.25591, + "grad_norm": 0.8232126219912062, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 25591 + }, + { + "epoch": 0.25592, + "grad_norm": 0.8550132623495436, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 25592 + }, + { + "epoch": 0.25593, + "grad_norm": 0.9510529383893469, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 25593 + }, + { + "epoch": 0.25594, + "grad_norm": 1.4121719430186077, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 25594 + }, + { + "epoch": 0.25595, + "grad_norm": 0.7312173130655611, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 25595 + }, + { + "epoch": 0.25596, + "grad_norm": 0.7739422926134939, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 25596 + }, + { + "epoch": 0.25597, + "grad_norm": 0.9058690802755301, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 25597 + }, + { + "epoch": 0.25598, + "grad_norm": 0.8462997574632294, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 25598 + }, + { + "epoch": 0.25599, + "grad_norm": 0.909678267602146, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 25599 + }, + { + "epoch": 0.256, + "grad_norm": 0.8912180986731276, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 25600 + }, + { + "epoch": 0.25601, + "grad_norm": 0.8666255804984621, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 25601 + }, + { + "epoch": 0.25602, + "grad_norm": 0.9898990004298943, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 25602 + }, + { + "epoch": 0.25603, + "grad_norm": 1.3179788081266002, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 25603 + }, + { + "epoch": 0.25604, + "grad_norm": 0.8472022132025522, + "learning_rate": 0.003, + "loss": 4.052, + "step": 25604 + }, + { + "epoch": 0.25605, + "grad_norm": 0.8768467138574475, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 25605 + }, + { + "epoch": 0.25606, + "grad_norm": 0.9307311815448447, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 25606 + }, + { + "epoch": 0.25607, + "grad_norm": 1.028048741438936, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 25607 + }, + { + "epoch": 0.25608, + "grad_norm": 0.959890155600721, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 25608 + }, + { + "epoch": 0.25609, + "grad_norm": 1.0530517493433318, + "learning_rate": 0.003, + "loss": 4.029, + "step": 25609 + }, + { + "epoch": 0.2561, + "grad_norm": 0.9781700754553776, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 25610 + }, + { + "epoch": 0.25611, + "grad_norm": 0.9615322913474433, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 25611 + }, + { + "epoch": 0.25612, + "grad_norm": 0.9705404058828069, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 25612 + }, + { + "epoch": 0.25613, + "grad_norm": 1.0440101934038097, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 25613 + }, + { + "epoch": 0.25614, + "grad_norm": 1.1459438755959968, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 25614 + }, + { + "epoch": 0.25615, + "grad_norm": 1.055613045407354, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 25615 + }, + { + "epoch": 0.25616, + "grad_norm": 0.8461364468505412, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 25616 + }, + { + "epoch": 0.25617, + "grad_norm": 0.8194803481270396, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 25617 + }, + { + "epoch": 0.25618, + "grad_norm": 0.7574136598273855, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 25618 + }, + { + "epoch": 0.25619, + "grad_norm": 0.6849649655458872, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 25619 + }, + { + "epoch": 0.2562, + "grad_norm": 0.6740934658316818, + "learning_rate": 0.003, + "loss": 4.076, + "step": 25620 + }, + { + "epoch": 0.25621, + "grad_norm": 0.7352822610255584, + "learning_rate": 0.003, + "loss": 4.068, + "step": 25621 + }, + { + "epoch": 0.25622, + "grad_norm": 0.8764602573807927, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 25622 + }, + { + "epoch": 0.25623, + "grad_norm": 0.8715898480093845, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 25623 + }, + { + "epoch": 0.25624, + "grad_norm": 0.8413433696271648, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 25624 + }, + { + "epoch": 0.25625, + "grad_norm": 0.9523683103187759, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 25625 + }, + { + "epoch": 0.25626, + "grad_norm": 1.2790198080640687, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 25626 + }, + { + "epoch": 0.25627, + "grad_norm": 0.8367658837111188, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 25627 + }, + { + "epoch": 0.25628, + "grad_norm": 0.7835435226818986, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 25628 + }, + { + "epoch": 0.25629, + "grad_norm": 0.815543660024562, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 25629 + }, + { + "epoch": 0.2563, + "grad_norm": 0.8565677365740393, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 25630 + }, + { + "epoch": 0.25631, + "grad_norm": 0.9228440531461264, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 25631 + }, + { + "epoch": 0.25632, + "grad_norm": 1.0810341151556777, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 25632 + }, + { + "epoch": 0.25633, + "grad_norm": 1.0614724139330207, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 25633 + }, + { + "epoch": 0.25634, + "grad_norm": 0.9351934832162114, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 25634 + }, + { + "epoch": 0.25635, + "grad_norm": 0.9388407753147652, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 25635 + }, + { + "epoch": 0.25636, + "grad_norm": 0.962198303489308, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 25636 + }, + { + "epoch": 0.25637, + "grad_norm": 1.0712447127686475, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 25637 + }, + { + "epoch": 0.25638, + "grad_norm": 0.9816949890034204, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 25638 + }, + { + "epoch": 0.25639, + "grad_norm": 1.0146084575011993, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 25639 + }, + { + "epoch": 0.2564, + "grad_norm": 0.9697545953193931, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 25640 + }, + { + "epoch": 0.25641, + "grad_norm": 0.904696813168741, + "learning_rate": 0.003, + "loss": 4.072, + "step": 25641 + }, + { + "epoch": 0.25642, + "grad_norm": 0.8029834457338992, + "learning_rate": 0.003, + "loss": 4.063, + "step": 25642 + }, + { + "epoch": 0.25643, + "grad_norm": 0.7341474364686016, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 25643 + }, + { + "epoch": 0.25644, + "grad_norm": 0.7317341320643208, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 25644 + }, + { + "epoch": 0.25645, + "grad_norm": 0.7290685796342252, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 25645 + }, + { + "epoch": 0.25646, + "grad_norm": 0.8389223548237573, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 25646 + }, + { + "epoch": 0.25647, + "grad_norm": 0.7323136602680181, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 25647 + }, + { + "epoch": 0.25648, + "grad_norm": 0.7006458012311327, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 25648 + }, + { + "epoch": 0.25649, + "grad_norm": 0.8796614539421426, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 25649 + }, + { + "epoch": 0.2565, + "grad_norm": 1.134289283000181, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 25650 + }, + { + "epoch": 0.25651, + "grad_norm": 1.0994671854732656, + "learning_rate": 0.003, + "loss": 4.062, + "step": 25651 + }, + { + "epoch": 0.25652, + "grad_norm": 1.0696433314732081, + "learning_rate": 0.003, + "loss": 4.026, + "step": 25652 + }, + { + "epoch": 0.25653, + "grad_norm": 0.9782450421160436, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 25653 + }, + { + "epoch": 0.25654, + "grad_norm": 0.9176937429881871, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 25654 + }, + { + "epoch": 0.25655, + "grad_norm": 1.0226859288837291, + "learning_rate": 0.003, + "loss": 4.025, + "step": 25655 + }, + { + "epoch": 0.25656, + "grad_norm": 1.1466472485479016, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 25656 + }, + { + "epoch": 0.25657, + "grad_norm": 0.8646470179603246, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 25657 + }, + { + "epoch": 0.25658, + "grad_norm": 0.8301829953055189, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 25658 + }, + { + "epoch": 0.25659, + "grad_norm": 0.8899418748121447, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 25659 + }, + { + "epoch": 0.2566, + "grad_norm": 0.7554487148446792, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 25660 + }, + { + "epoch": 0.25661, + "grad_norm": 0.7664215667047021, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 25661 + }, + { + "epoch": 0.25662, + "grad_norm": 0.7743180600402911, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 25662 + }, + { + "epoch": 0.25663, + "grad_norm": 0.7839466630831146, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 25663 + }, + { + "epoch": 0.25664, + "grad_norm": 0.8355764237406392, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 25664 + }, + { + "epoch": 0.25665, + "grad_norm": 0.9217610469090546, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 25665 + }, + { + "epoch": 0.25666, + "grad_norm": 1.0406837259996145, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 25666 + }, + { + "epoch": 0.25667, + "grad_norm": 1.0871064122799725, + "learning_rate": 0.003, + "loss": 4.037, + "step": 25667 + }, + { + "epoch": 0.25668, + "grad_norm": 0.8762911761659423, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 25668 + }, + { + "epoch": 0.25669, + "grad_norm": 0.7654352563074435, + "learning_rate": 0.003, + "loss": 4.069, + "step": 25669 + }, + { + "epoch": 0.2567, + "grad_norm": 0.6871249401806157, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 25670 + }, + { + "epoch": 0.25671, + "grad_norm": 0.7061119219021542, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 25671 + }, + { + "epoch": 0.25672, + "grad_norm": 0.7579857118313267, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 25672 + }, + { + "epoch": 0.25673, + "grad_norm": 0.8100201997864876, + "learning_rate": 0.003, + "loss": 4.064, + "step": 25673 + }, + { + "epoch": 0.25674, + "grad_norm": 0.9258903405609418, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 25674 + }, + { + "epoch": 0.25675, + "grad_norm": 0.986459741839688, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 25675 + }, + { + "epoch": 0.25676, + "grad_norm": 0.9253357279033798, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 25676 + }, + { + "epoch": 0.25677, + "grad_norm": 0.8626855461393848, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 25677 + }, + { + "epoch": 0.25678, + "grad_norm": 0.7917328886376643, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 25678 + }, + { + "epoch": 0.25679, + "grad_norm": 0.8560363159827711, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 25679 + }, + { + "epoch": 0.2568, + "grad_norm": 0.8596436207762913, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 25680 + }, + { + "epoch": 0.25681, + "grad_norm": 0.8531007742661948, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 25681 + }, + { + "epoch": 0.25682, + "grad_norm": 0.9904724442011617, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 25682 + }, + { + "epoch": 0.25683, + "grad_norm": 1.0273013989368105, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 25683 + }, + { + "epoch": 0.25684, + "grad_norm": 0.8995409725632936, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 25684 + }, + { + "epoch": 0.25685, + "grad_norm": 0.7661760095224859, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 25685 + }, + { + "epoch": 0.25686, + "grad_norm": 0.6275484598990031, + "learning_rate": 0.003, + "loss": 4.052, + "step": 25686 + }, + { + "epoch": 0.25687, + "grad_norm": 0.7235673533566921, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 25687 + }, + { + "epoch": 0.25688, + "grad_norm": 0.8798494700149826, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 25688 + }, + { + "epoch": 0.25689, + "grad_norm": 1.0494151545547452, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 25689 + }, + { + "epoch": 0.2569, + "grad_norm": 1.1671667975695061, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 25690 + }, + { + "epoch": 0.25691, + "grad_norm": 0.7408143499619712, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 25691 + }, + { + "epoch": 0.25692, + "grad_norm": 0.7026773157590855, + "learning_rate": 0.003, + "loss": 4.028, + "step": 25692 + }, + { + "epoch": 0.25693, + "grad_norm": 0.8980310242223455, + "learning_rate": 0.003, + "loss": 4.0004, + "step": 25693 + }, + { + "epoch": 0.25694, + "grad_norm": 1.0824228126626987, + "learning_rate": 0.003, + "loss": 4.035, + "step": 25694 + }, + { + "epoch": 0.25695, + "grad_norm": 0.924279239306092, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 25695 + }, + { + "epoch": 0.25696, + "grad_norm": 0.753752479617001, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 25696 + }, + { + "epoch": 0.25697, + "grad_norm": 0.6580218777582888, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 25697 + }, + { + "epoch": 0.25698, + "grad_norm": 0.6875077978791423, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 25698 + }, + { + "epoch": 0.25699, + "grad_norm": 0.6162836382824428, + "learning_rate": 0.003, + "loss": 4.045, + "step": 25699 + }, + { + "epoch": 0.257, + "grad_norm": 0.5708048517756945, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 25700 + }, + { + "epoch": 0.25701, + "grad_norm": 0.5898817318191806, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 25701 + }, + { + "epoch": 0.25702, + "grad_norm": 0.6010757892884534, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 25702 + }, + { + "epoch": 0.25703, + "grad_norm": 0.6421809623132237, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 25703 + }, + { + "epoch": 0.25704, + "grad_norm": 0.6782850962867656, + "learning_rate": 0.003, + "loss": 4.027, + "step": 25704 + }, + { + "epoch": 0.25705, + "grad_norm": 0.6860537944354691, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 25705 + }, + { + "epoch": 0.25706, + "grad_norm": 0.8158097591612413, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 25706 + }, + { + "epoch": 0.25707, + "grad_norm": 1.1273595771592388, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 25707 + }, + { + "epoch": 0.25708, + "grad_norm": 1.1207301596591779, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 25708 + }, + { + "epoch": 0.25709, + "grad_norm": 1.058964295915854, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 25709 + }, + { + "epoch": 0.2571, + "grad_norm": 0.9343641816461868, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 25710 + }, + { + "epoch": 0.25711, + "grad_norm": 0.9226828660493398, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 25711 + }, + { + "epoch": 0.25712, + "grad_norm": 1.0302871551029846, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 25712 + }, + { + "epoch": 0.25713, + "grad_norm": 0.9643125705497843, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 25713 + }, + { + "epoch": 0.25714, + "grad_norm": 1.0836966712936023, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 25714 + }, + { + "epoch": 0.25715, + "grad_norm": 0.9690842198080638, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 25715 + }, + { + "epoch": 0.25716, + "grad_norm": 1.2217463202640368, + "learning_rate": 0.003, + "loss": 4.056, + "step": 25716 + }, + { + "epoch": 0.25717, + "grad_norm": 1.115843673163455, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 25717 + }, + { + "epoch": 0.25718, + "grad_norm": 0.9141255348458905, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 25718 + }, + { + "epoch": 0.25719, + "grad_norm": 0.7814168289509926, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 25719 + }, + { + "epoch": 0.2572, + "grad_norm": 0.8702282404854796, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 25720 + }, + { + "epoch": 0.25721, + "grad_norm": 0.8872346559261546, + "learning_rate": 0.003, + "loss": 4.062, + "step": 25721 + }, + { + "epoch": 0.25722, + "grad_norm": 0.8800332032413002, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 25722 + }, + { + "epoch": 0.25723, + "grad_norm": 0.9926870576404875, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 25723 + }, + { + "epoch": 0.25724, + "grad_norm": 0.9740608546546412, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 25724 + }, + { + "epoch": 0.25725, + "grad_norm": 0.9598478971687701, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 25725 + }, + { + "epoch": 0.25726, + "grad_norm": 1.1782344971632273, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 25726 + }, + { + "epoch": 0.25727, + "grad_norm": 0.8986815958279982, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 25727 + }, + { + "epoch": 0.25728, + "grad_norm": 0.8982273220523551, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 25728 + }, + { + "epoch": 0.25729, + "grad_norm": 0.9391246262802219, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 25729 + }, + { + "epoch": 0.2573, + "grad_norm": 0.8725527593120934, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 25730 + }, + { + "epoch": 0.25731, + "grad_norm": 0.9242694734094682, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 25731 + }, + { + "epoch": 0.25732, + "grad_norm": 1.058437329058568, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 25732 + }, + { + "epoch": 0.25733, + "grad_norm": 1.0376223346138786, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 25733 + }, + { + "epoch": 0.25734, + "grad_norm": 0.8543840355948839, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 25734 + }, + { + "epoch": 0.25735, + "grad_norm": 0.7257909123798285, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 25735 + }, + { + "epoch": 0.25736, + "grad_norm": 0.7997216759158365, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 25736 + }, + { + "epoch": 0.25737, + "grad_norm": 0.9325336733075685, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 25737 + }, + { + "epoch": 0.25738, + "grad_norm": 0.980629589777745, + "learning_rate": 0.003, + "loss": 3.996, + "step": 25738 + }, + { + "epoch": 0.25739, + "grad_norm": 0.9973327509693599, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 25739 + }, + { + "epoch": 0.2574, + "grad_norm": 0.9908197866583291, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 25740 + }, + { + "epoch": 0.25741, + "grad_norm": 0.9549293432185175, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 25741 + }, + { + "epoch": 0.25742, + "grad_norm": 1.0225836593225683, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 25742 + }, + { + "epoch": 0.25743, + "grad_norm": 1.1774744232298078, + "learning_rate": 0.003, + "loss": 4.03, + "step": 25743 + }, + { + "epoch": 0.25744, + "grad_norm": 0.8781227743866601, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 25744 + }, + { + "epoch": 0.25745, + "grad_norm": 0.6880503915672852, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 25745 + }, + { + "epoch": 0.25746, + "grad_norm": 0.6580475514599924, + "learning_rate": 0.003, + "loss": 4.056, + "step": 25746 + }, + { + "epoch": 0.25747, + "grad_norm": 0.6756631140243022, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 25747 + }, + { + "epoch": 0.25748, + "grad_norm": 0.7968680113685896, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 25748 + }, + { + "epoch": 0.25749, + "grad_norm": 0.8426893521184446, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 25749 + }, + { + "epoch": 0.2575, + "grad_norm": 0.795421932609609, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 25750 + }, + { + "epoch": 0.25751, + "grad_norm": 0.769803752182991, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 25751 + }, + { + "epoch": 0.25752, + "grad_norm": 0.8231547235797607, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 25752 + }, + { + "epoch": 0.25753, + "grad_norm": 0.7707796307353454, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 25753 + }, + { + "epoch": 0.25754, + "grad_norm": 0.7720654428495404, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 25754 + }, + { + "epoch": 0.25755, + "grad_norm": 0.8182178589963739, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 25755 + }, + { + "epoch": 0.25756, + "grad_norm": 0.7238280484282918, + "learning_rate": 0.003, + "loss": 4.05, + "step": 25756 + }, + { + "epoch": 0.25757, + "grad_norm": 0.6772948109937411, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 25757 + }, + { + "epoch": 0.25758, + "grad_norm": 0.7678719313834955, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 25758 + }, + { + "epoch": 0.25759, + "grad_norm": 0.9344363985664037, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 25759 + }, + { + "epoch": 0.2576, + "grad_norm": 1.154656001649035, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 25760 + }, + { + "epoch": 0.25761, + "grad_norm": 0.7646643440630714, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 25761 + }, + { + "epoch": 0.25762, + "grad_norm": 0.6659631380371734, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 25762 + }, + { + "epoch": 0.25763, + "grad_norm": 0.7176069338262329, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 25763 + }, + { + "epoch": 0.25764, + "grad_norm": 0.7030459219151513, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 25764 + }, + { + "epoch": 0.25765, + "grad_norm": 0.9200535271219515, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 25765 + }, + { + "epoch": 0.25766, + "grad_norm": 1.2085399986251164, + "learning_rate": 0.003, + "loss": 4.054, + "step": 25766 + }, + { + "epoch": 0.25767, + "grad_norm": 0.9070711000996539, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 25767 + }, + { + "epoch": 0.25768, + "grad_norm": 0.9551722803549869, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 25768 + }, + { + "epoch": 0.25769, + "grad_norm": 1.0473348674472607, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 25769 + }, + { + "epoch": 0.2577, + "grad_norm": 0.9034143418257037, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 25770 + }, + { + "epoch": 0.25771, + "grad_norm": 0.9576962150044424, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 25771 + }, + { + "epoch": 0.25772, + "grad_norm": 0.8674523085836866, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 25772 + }, + { + "epoch": 0.25773, + "grad_norm": 0.8693565533491876, + "learning_rate": 0.003, + "loss": 4.045, + "step": 25773 + }, + { + "epoch": 0.25774, + "grad_norm": 0.9475099252831082, + "learning_rate": 0.003, + "loss": 4.047, + "step": 25774 + }, + { + "epoch": 0.25775, + "grad_norm": 1.0200939862649179, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 25775 + }, + { + "epoch": 0.25776, + "grad_norm": 0.9954123108957649, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 25776 + }, + { + "epoch": 0.25777, + "grad_norm": 0.897136140992167, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 25777 + }, + { + "epoch": 0.25778, + "grad_norm": 0.8759373911773831, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 25778 + }, + { + "epoch": 0.25779, + "grad_norm": 0.8813415194178811, + "learning_rate": 0.003, + "loss": 4.069, + "step": 25779 + }, + { + "epoch": 0.2578, + "grad_norm": 0.9494621897883795, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 25780 + }, + { + "epoch": 0.25781, + "grad_norm": 1.0759895692576578, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 25781 + }, + { + "epoch": 0.25782, + "grad_norm": 1.0767392451194029, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 25782 + }, + { + "epoch": 0.25783, + "grad_norm": 0.99383370567182, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 25783 + }, + { + "epoch": 0.25784, + "grad_norm": 1.103720313872647, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 25784 + }, + { + "epoch": 0.25785, + "grad_norm": 1.0266387934492864, + "learning_rate": 0.003, + "loss": 4.064, + "step": 25785 + }, + { + "epoch": 0.25786, + "grad_norm": 0.925162090251453, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 25786 + }, + { + "epoch": 0.25787, + "grad_norm": 0.7942677904027319, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 25787 + }, + { + "epoch": 0.25788, + "grad_norm": 0.8909008635908302, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 25788 + }, + { + "epoch": 0.25789, + "grad_norm": 0.9280757163826667, + "learning_rate": 0.003, + "loss": 4.028, + "step": 25789 + }, + { + "epoch": 0.2579, + "grad_norm": 0.7453131310347371, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 25790 + }, + { + "epoch": 0.25791, + "grad_norm": 0.7665388665222183, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 25791 + }, + { + "epoch": 0.25792, + "grad_norm": 0.7868354289513428, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 25792 + }, + { + "epoch": 0.25793, + "grad_norm": 0.6918202009526567, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 25793 + }, + { + "epoch": 0.25794, + "grad_norm": 0.7564816172491939, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 25794 + }, + { + "epoch": 0.25795, + "grad_norm": 1.0367794617615937, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 25795 + }, + { + "epoch": 0.25796, + "grad_norm": 1.1524336035187397, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 25796 + }, + { + "epoch": 0.25797, + "grad_norm": 0.7277175467416978, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 25797 + }, + { + "epoch": 0.25798, + "grad_norm": 0.7094363324290363, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 25798 + }, + { + "epoch": 0.25799, + "grad_norm": 0.6298351131852827, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 25799 + }, + { + "epoch": 0.258, + "grad_norm": 0.6481622423664767, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 25800 + }, + { + "epoch": 0.25801, + "grad_norm": 0.5779186357375955, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 25801 + }, + { + "epoch": 0.25802, + "grad_norm": 0.5033090271479658, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 25802 + }, + { + "epoch": 0.25803, + "grad_norm": 0.5136676267862368, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 25803 + }, + { + "epoch": 0.25804, + "grad_norm": 0.5945167478390637, + "learning_rate": 0.003, + "loss": 4.04, + "step": 25804 + }, + { + "epoch": 0.25805, + "grad_norm": 0.651430175051021, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 25805 + }, + { + "epoch": 0.25806, + "grad_norm": 0.8020861174781325, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 25806 + }, + { + "epoch": 0.25807, + "grad_norm": 1.0394046100369074, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 25807 + }, + { + "epoch": 0.25808, + "grad_norm": 1.1820717808178223, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 25808 + }, + { + "epoch": 0.25809, + "grad_norm": 0.9585555198895713, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 25809 + }, + { + "epoch": 0.2581, + "grad_norm": 1.0524183857996274, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 25810 + }, + { + "epoch": 0.25811, + "grad_norm": 0.9594371755258403, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 25811 + }, + { + "epoch": 0.25812, + "grad_norm": 0.8475604564211586, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 25812 + }, + { + "epoch": 0.25813, + "grad_norm": 0.8442216772149274, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 25813 + }, + { + "epoch": 0.25814, + "grad_norm": 1.0356576458822522, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 25814 + }, + { + "epoch": 0.25815, + "grad_norm": 1.2569038573350582, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 25815 + }, + { + "epoch": 0.25816, + "grad_norm": 0.6622131297530263, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 25816 + }, + { + "epoch": 0.25817, + "grad_norm": 0.6320970930100901, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 25817 + }, + { + "epoch": 0.25818, + "grad_norm": 0.67163893254025, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 25818 + }, + { + "epoch": 0.25819, + "grad_norm": 0.6654415833982926, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 25819 + }, + { + "epoch": 0.2582, + "grad_norm": 0.8112589598221958, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 25820 + }, + { + "epoch": 0.25821, + "grad_norm": 0.9196013931368918, + "learning_rate": 0.003, + "loss": 3.9979, + "step": 25821 + }, + { + "epoch": 0.25822, + "grad_norm": 1.0425288442262701, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 25822 + }, + { + "epoch": 0.25823, + "grad_norm": 0.8425794297654894, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 25823 + }, + { + "epoch": 0.25824, + "grad_norm": 0.7968565549634317, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 25824 + }, + { + "epoch": 0.25825, + "grad_norm": 0.8450979405567284, + "learning_rate": 0.003, + "loss": 4.046, + "step": 25825 + }, + { + "epoch": 0.25826, + "grad_norm": 0.8863244259587216, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 25826 + }, + { + "epoch": 0.25827, + "grad_norm": 1.0289805153391658, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 25827 + }, + { + "epoch": 0.25828, + "grad_norm": 1.1437242005962354, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 25828 + }, + { + "epoch": 0.25829, + "grad_norm": 0.8939463470666488, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 25829 + }, + { + "epoch": 0.2583, + "grad_norm": 0.8803308972100106, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 25830 + }, + { + "epoch": 0.25831, + "grad_norm": 0.7933552530329523, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 25831 + }, + { + "epoch": 0.25832, + "grad_norm": 0.9356777367398023, + "learning_rate": 0.003, + "loss": 4.049, + "step": 25832 + }, + { + "epoch": 0.25833, + "grad_norm": 0.9516656042341312, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 25833 + }, + { + "epoch": 0.25834, + "grad_norm": 0.9193313586915596, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 25834 + }, + { + "epoch": 0.25835, + "grad_norm": 0.9634382042760945, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 25835 + }, + { + "epoch": 0.25836, + "grad_norm": 0.9335316656342327, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 25836 + }, + { + "epoch": 0.25837, + "grad_norm": 0.8970002937832366, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 25837 + }, + { + "epoch": 0.25838, + "grad_norm": 0.9901839838475006, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 25838 + }, + { + "epoch": 0.25839, + "grad_norm": 1.2037838117017072, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 25839 + }, + { + "epoch": 0.2584, + "grad_norm": 0.8802229977940653, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 25840 + }, + { + "epoch": 0.25841, + "grad_norm": 0.882073749070165, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 25841 + }, + { + "epoch": 0.25842, + "grad_norm": 1.0082580681189421, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 25842 + }, + { + "epoch": 0.25843, + "grad_norm": 0.874982647275984, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 25843 + }, + { + "epoch": 0.25844, + "grad_norm": 0.8621735796180681, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 25844 + }, + { + "epoch": 0.25845, + "grad_norm": 0.8932734465674992, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 25845 + }, + { + "epoch": 0.25846, + "grad_norm": 1.0257582556690623, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 25846 + }, + { + "epoch": 0.25847, + "grad_norm": 1.2066263872531944, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 25847 + }, + { + "epoch": 0.25848, + "grad_norm": 0.8650901957642076, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 25848 + }, + { + "epoch": 0.25849, + "grad_norm": 0.8667146177648227, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 25849 + }, + { + "epoch": 0.2585, + "grad_norm": 0.9662269336997142, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 25850 + }, + { + "epoch": 0.25851, + "grad_norm": 1.2057527719941996, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 25851 + }, + { + "epoch": 0.25852, + "grad_norm": 0.8808538520293002, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 25852 + }, + { + "epoch": 0.25853, + "grad_norm": 0.8309043526801995, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 25853 + }, + { + "epoch": 0.25854, + "grad_norm": 0.9413281953063685, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 25854 + }, + { + "epoch": 0.25855, + "grad_norm": 0.9191799449942011, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 25855 + }, + { + "epoch": 0.25856, + "grad_norm": 0.9198332894436225, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 25856 + }, + { + "epoch": 0.25857, + "grad_norm": 0.9436250340763263, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 25857 + }, + { + "epoch": 0.25858, + "grad_norm": 1.001050141894837, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 25858 + }, + { + "epoch": 0.25859, + "grad_norm": 0.9844950994709294, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 25859 + }, + { + "epoch": 0.2586, + "grad_norm": 0.8786801301635868, + "learning_rate": 0.003, + "loss": 4.055, + "step": 25860 + }, + { + "epoch": 0.25861, + "grad_norm": 0.7180851277303991, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 25861 + }, + { + "epoch": 0.25862, + "grad_norm": 0.6390804457022464, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 25862 + }, + { + "epoch": 0.25863, + "grad_norm": 0.5646052416997042, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 25863 + }, + { + "epoch": 0.25864, + "grad_norm": 0.5339932331561656, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 25864 + }, + { + "epoch": 0.25865, + "grad_norm": 0.5683587474693323, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 25865 + }, + { + "epoch": 0.25866, + "grad_norm": 0.5984531195999825, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 25866 + }, + { + "epoch": 0.25867, + "grad_norm": 0.5521101672393206, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 25867 + }, + { + "epoch": 0.25868, + "grad_norm": 0.6007304567688321, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 25868 + }, + { + "epoch": 0.25869, + "grad_norm": 0.6559460457821347, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 25869 + }, + { + "epoch": 0.2587, + "grad_norm": 0.866258462499432, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 25870 + }, + { + "epoch": 0.25871, + "grad_norm": 1.2472532541149866, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 25871 + }, + { + "epoch": 0.25872, + "grad_norm": 0.8446041709917281, + "learning_rate": 0.003, + "loss": 4.04, + "step": 25872 + }, + { + "epoch": 0.25873, + "grad_norm": 0.6451099728966582, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 25873 + }, + { + "epoch": 0.25874, + "grad_norm": 0.621428619108411, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 25874 + }, + { + "epoch": 0.25875, + "grad_norm": 0.6419477296677296, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 25875 + }, + { + "epoch": 0.25876, + "grad_norm": 0.7309942038750866, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 25876 + }, + { + "epoch": 0.25877, + "grad_norm": 0.9223989918023651, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 25877 + }, + { + "epoch": 0.25878, + "grad_norm": 1.1371450193314694, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 25878 + }, + { + "epoch": 0.25879, + "grad_norm": 0.9127424359215026, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 25879 + }, + { + "epoch": 0.2588, + "grad_norm": 0.898105141764287, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 25880 + }, + { + "epoch": 0.25881, + "grad_norm": 0.8870293237976957, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 25881 + }, + { + "epoch": 0.25882, + "grad_norm": 0.8571869344215011, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 25882 + }, + { + "epoch": 0.25883, + "grad_norm": 0.8373273506868274, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 25883 + }, + { + "epoch": 0.25884, + "grad_norm": 0.8422612089063182, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 25884 + }, + { + "epoch": 0.25885, + "grad_norm": 0.9146282554344322, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 25885 + }, + { + "epoch": 0.25886, + "grad_norm": 0.7739463592685634, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 25886 + }, + { + "epoch": 0.25887, + "grad_norm": 0.7562467325592032, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 25887 + }, + { + "epoch": 0.25888, + "grad_norm": 0.9399885995128254, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 25888 + }, + { + "epoch": 0.25889, + "grad_norm": 1.1139726599018331, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 25889 + }, + { + "epoch": 0.2589, + "grad_norm": 0.956585460755578, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 25890 + }, + { + "epoch": 0.25891, + "grad_norm": 1.048416275823911, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 25891 + }, + { + "epoch": 0.25892, + "grad_norm": 1.3350114298512115, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 25892 + }, + { + "epoch": 0.25893, + "grad_norm": 0.8264402436976467, + "learning_rate": 0.003, + "loss": 4.067, + "step": 25893 + }, + { + "epoch": 0.25894, + "grad_norm": 0.8854505957219764, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 25894 + }, + { + "epoch": 0.25895, + "grad_norm": 0.958151037817533, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 25895 + }, + { + "epoch": 0.25896, + "grad_norm": 1.068271233532324, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 25896 + }, + { + "epoch": 0.25897, + "grad_norm": 0.7958327895186796, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 25897 + }, + { + "epoch": 0.25898, + "grad_norm": 0.8441338680320615, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 25898 + }, + { + "epoch": 0.25899, + "grad_norm": 0.7808939378255455, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 25899 + }, + { + "epoch": 0.259, + "grad_norm": 0.7333410233655898, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 25900 + }, + { + "epoch": 0.25901, + "grad_norm": 0.7421629819934148, + "learning_rate": 0.003, + "loss": 4.073, + "step": 25901 + }, + { + "epoch": 0.25902, + "grad_norm": 0.769639933952084, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 25902 + }, + { + "epoch": 0.25903, + "grad_norm": 0.7747913705562472, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 25903 + }, + { + "epoch": 0.25904, + "grad_norm": 1.0656976329485086, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 25904 + }, + { + "epoch": 0.25905, + "grad_norm": 1.2563630347617631, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 25905 + }, + { + "epoch": 0.25906, + "grad_norm": 0.8854321809695271, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 25906 + }, + { + "epoch": 0.25907, + "grad_norm": 0.8059072358321804, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 25907 + }, + { + "epoch": 0.25908, + "grad_norm": 0.8138147215454772, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 25908 + }, + { + "epoch": 0.25909, + "grad_norm": 0.7817918998199807, + "learning_rate": 0.003, + "loss": 4.061, + "step": 25909 + }, + { + "epoch": 0.2591, + "grad_norm": 0.736903925131948, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 25910 + }, + { + "epoch": 0.25911, + "grad_norm": 0.7987525143998383, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 25911 + }, + { + "epoch": 0.25912, + "grad_norm": 0.835157078093261, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 25912 + }, + { + "epoch": 0.25913, + "grad_norm": 0.8077293035560028, + "learning_rate": 0.003, + "loss": 4.026, + "step": 25913 + }, + { + "epoch": 0.25914, + "grad_norm": 0.7429075380762054, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 25914 + }, + { + "epoch": 0.25915, + "grad_norm": 0.6890816593703376, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 25915 + }, + { + "epoch": 0.25916, + "grad_norm": 0.7534742554035114, + "learning_rate": 0.003, + "loss": 4.059, + "step": 25916 + }, + { + "epoch": 0.25917, + "grad_norm": 0.9248898216498914, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 25917 + }, + { + "epoch": 0.25918, + "grad_norm": 1.225960246634099, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 25918 + }, + { + "epoch": 0.25919, + "grad_norm": 1.107042752603581, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 25919 + }, + { + "epoch": 0.2592, + "grad_norm": 0.8927881538546854, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 25920 + }, + { + "epoch": 0.25921, + "grad_norm": 0.9997249781762862, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 25921 + }, + { + "epoch": 0.25922, + "grad_norm": 1.118074978306857, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 25922 + }, + { + "epoch": 0.25923, + "grad_norm": 0.9360974885277822, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 25923 + }, + { + "epoch": 0.25924, + "grad_norm": 0.9429028883868438, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 25924 + }, + { + "epoch": 0.25925, + "grad_norm": 0.8601733760442539, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 25925 + }, + { + "epoch": 0.25926, + "grad_norm": 0.8769605528666334, + "learning_rate": 0.003, + "loss": 4.054, + "step": 25926 + }, + { + "epoch": 0.25927, + "grad_norm": 0.8481890702461181, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 25927 + }, + { + "epoch": 0.25928, + "grad_norm": 0.8661471111995374, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 25928 + }, + { + "epoch": 0.25929, + "grad_norm": 0.7393976948562561, + "learning_rate": 0.003, + "loss": 4.048, + "step": 25929 + }, + { + "epoch": 0.2593, + "grad_norm": 0.7644322353547663, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 25930 + }, + { + "epoch": 0.25931, + "grad_norm": 0.7570464830441276, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 25931 + }, + { + "epoch": 0.25932, + "grad_norm": 0.7584386440374044, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 25932 + }, + { + "epoch": 0.25933, + "grad_norm": 0.8084914853375832, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 25933 + }, + { + "epoch": 0.25934, + "grad_norm": 0.9295254742130984, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 25934 + }, + { + "epoch": 0.25935, + "grad_norm": 0.9964237938787681, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 25935 + }, + { + "epoch": 0.25936, + "grad_norm": 0.9839443778873309, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 25936 + }, + { + "epoch": 0.25937, + "grad_norm": 0.9517870221082645, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 25937 + }, + { + "epoch": 0.25938, + "grad_norm": 0.8718998656602899, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 25938 + }, + { + "epoch": 0.25939, + "grad_norm": 0.8505759838773718, + "learning_rate": 0.003, + "loss": 4.053, + "step": 25939 + }, + { + "epoch": 0.2594, + "grad_norm": 0.7463630794894784, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 25940 + }, + { + "epoch": 0.25941, + "grad_norm": 0.626229569035502, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 25941 + }, + { + "epoch": 0.25942, + "grad_norm": 0.49408710913084736, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 25942 + }, + { + "epoch": 0.25943, + "grad_norm": 0.5257735751132183, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 25943 + }, + { + "epoch": 0.25944, + "grad_norm": 0.5924549057047243, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 25944 + }, + { + "epoch": 0.25945, + "grad_norm": 0.6983411196812239, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 25945 + }, + { + "epoch": 0.25946, + "grad_norm": 0.888046310634318, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 25946 + }, + { + "epoch": 0.25947, + "grad_norm": 1.1124840738111583, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 25947 + }, + { + "epoch": 0.25948, + "grad_norm": 1.228698154302832, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 25948 + }, + { + "epoch": 0.25949, + "grad_norm": 0.9747231765418726, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 25949 + }, + { + "epoch": 0.2595, + "grad_norm": 0.9428329202593738, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 25950 + }, + { + "epoch": 0.25951, + "grad_norm": 1.181130542815705, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 25951 + }, + { + "epoch": 0.25952, + "grad_norm": 1.029283320988199, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 25952 + }, + { + "epoch": 0.25953, + "grad_norm": 0.9231522500756801, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 25953 + }, + { + "epoch": 0.25954, + "grad_norm": 0.9145587183891746, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 25954 + }, + { + "epoch": 0.25955, + "grad_norm": 0.8742515667345413, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 25955 + }, + { + "epoch": 0.25956, + "grad_norm": 0.8784317182165235, + "learning_rate": 0.003, + "loss": 4.05, + "step": 25956 + }, + { + "epoch": 0.25957, + "grad_norm": 0.7918525254031997, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 25957 + }, + { + "epoch": 0.25958, + "grad_norm": 0.77294678671553, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 25958 + }, + { + "epoch": 0.25959, + "grad_norm": 0.7908012745806853, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 25959 + }, + { + "epoch": 0.2596, + "grad_norm": 0.7282710268878122, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 25960 + }, + { + "epoch": 0.25961, + "grad_norm": 0.7346603375071983, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 25961 + }, + { + "epoch": 0.25962, + "grad_norm": 0.7603862860837758, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 25962 + }, + { + "epoch": 0.25963, + "grad_norm": 0.7799600352324344, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 25963 + }, + { + "epoch": 0.25964, + "grad_norm": 0.9075938870325732, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 25964 + }, + { + "epoch": 0.25965, + "grad_norm": 1.129640597125766, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 25965 + }, + { + "epoch": 0.25966, + "grad_norm": 1.2533015232673987, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 25966 + }, + { + "epoch": 0.25967, + "grad_norm": 0.7429426303594748, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 25967 + }, + { + "epoch": 0.25968, + "grad_norm": 0.6540424254883969, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 25968 + }, + { + "epoch": 0.25969, + "grad_norm": 0.647684024516191, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 25969 + }, + { + "epoch": 0.2597, + "grad_norm": 0.6269519995778846, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 25970 + }, + { + "epoch": 0.25971, + "grad_norm": 0.5695166804661257, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 25971 + }, + { + "epoch": 0.25972, + "grad_norm": 0.6019006743048572, + "learning_rate": 0.003, + "loss": 4.0003, + "step": 25972 + }, + { + "epoch": 0.25973, + "grad_norm": 0.6422169016009158, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 25973 + }, + { + "epoch": 0.25974, + "grad_norm": 0.739067701987218, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 25974 + }, + { + "epoch": 0.25975, + "grad_norm": 1.0781997776379828, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 25975 + }, + { + "epoch": 0.25976, + "grad_norm": 1.3148599174145454, + "learning_rate": 0.003, + "loss": 4.078, + "step": 25976 + }, + { + "epoch": 0.25977, + "grad_norm": 0.7925418266882831, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 25977 + }, + { + "epoch": 0.25978, + "grad_norm": 0.7807032651954832, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 25978 + }, + { + "epoch": 0.25979, + "grad_norm": 0.7000639361401566, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 25979 + }, + { + "epoch": 0.2598, + "grad_norm": 0.64633918234036, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 25980 + }, + { + "epoch": 0.25981, + "grad_norm": 0.7300634029945527, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 25981 + }, + { + "epoch": 0.25982, + "grad_norm": 0.7933567370638227, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 25982 + }, + { + "epoch": 0.25983, + "grad_norm": 0.8403981009703373, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 25983 + }, + { + "epoch": 0.25984, + "grad_norm": 0.8285355489614983, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 25984 + }, + { + "epoch": 0.25985, + "grad_norm": 0.7728346434405934, + "learning_rate": 0.003, + "loss": 4.0055, + "step": 25985 + }, + { + "epoch": 0.25986, + "grad_norm": 0.8797368426171809, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 25986 + }, + { + "epoch": 0.25987, + "grad_norm": 1.3317383596723136, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 25987 + }, + { + "epoch": 0.25988, + "grad_norm": 0.7740636664422671, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 25988 + }, + { + "epoch": 0.25989, + "grad_norm": 0.6182379072113148, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 25989 + }, + { + "epoch": 0.2599, + "grad_norm": 0.712512953438655, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 25990 + }, + { + "epoch": 0.25991, + "grad_norm": 0.7915284614381275, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 25991 + }, + { + "epoch": 0.25992, + "grad_norm": 0.88120967077237, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 25992 + }, + { + "epoch": 0.25993, + "grad_norm": 1.013443159211642, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 25993 + }, + { + "epoch": 0.25994, + "grad_norm": 1.0538616180157208, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 25994 + }, + { + "epoch": 0.25995, + "grad_norm": 0.8287732721296354, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 25995 + }, + { + "epoch": 0.25996, + "grad_norm": 0.753926989635847, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 25996 + }, + { + "epoch": 0.25997, + "grad_norm": 0.700423687167093, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 25997 + }, + { + "epoch": 0.25998, + "grad_norm": 0.7266771733902084, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 25998 + }, + { + "epoch": 0.25999, + "grad_norm": 0.871108245073119, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 25999 + }, + { + "epoch": 0.26, + "grad_norm": 1.2009645309298072, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 26000 + }, + { + "epoch": 0.26001, + "grad_norm": 1.2097418553810149, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 26001 + }, + { + "epoch": 0.26002, + "grad_norm": 0.8352746471524711, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 26002 + }, + { + "epoch": 0.26003, + "grad_norm": 0.7338772790170216, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 26003 + }, + { + "epoch": 0.26004, + "grad_norm": 0.7757394096883197, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 26004 + }, + { + "epoch": 0.26005, + "grad_norm": 0.8829887573540036, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 26005 + }, + { + "epoch": 0.26006, + "grad_norm": 0.9329015266747533, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 26006 + }, + { + "epoch": 0.26007, + "grad_norm": 0.965185850757229, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 26007 + }, + { + "epoch": 0.26008, + "grad_norm": 1.0040216341454218, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 26008 + }, + { + "epoch": 0.26009, + "grad_norm": 0.9504447047903598, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 26009 + }, + { + "epoch": 0.2601, + "grad_norm": 0.8408871255891638, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 26010 + }, + { + "epoch": 0.26011, + "grad_norm": 0.9130050950447212, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 26011 + }, + { + "epoch": 0.26012, + "grad_norm": 1.1612430050413889, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 26012 + }, + { + "epoch": 0.26013, + "grad_norm": 1.273213791480937, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 26013 + }, + { + "epoch": 0.26014, + "grad_norm": 0.7340124097636823, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 26014 + }, + { + "epoch": 0.26015, + "grad_norm": 0.697058503789482, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 26015 + }, + { + "epoch": 0.26016, + "grad_norm": 0.7251512152129509, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 26016 + }, + { + "epoch": 0.26017, + "grad_norm": 0.8297913356683271, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 26017 + }, + { + "epoch": 0.26018, + "grad_norm": 0.8372902861293796, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 26018 + }, + { + "epoch": 0.26019, + "grad_norm": 0.7562682980642035, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 26019 + }, + { + "epoch": 0.2602, + "grad_norm": 0.8254067989960779, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 26020 + }, + { + "epoch": 0.26021, + "grad_norm": 0.9526137880010397, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 26021 + }, + { + "epoch": 0.26022, + "grad_norm": 0.9736319958034201, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 26022 + }, + { + "epoch": 0.26023, + "grad_norm": 0.9702953713394484, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 26023 + }, + { + "epoch": 0.26024, + "grad_norm": 1.098826588956019, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 26024 + }, + { + "epoch": 0.26025, + "grad_norm": 0.8332824081363539, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 26025 + }, + { + "epoch": 0.26026, + "grad_norm": 0.843185822311754, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 26026 + }, + { + "epoch": 0.26027, + "grad_norm": 1.0066888019556928, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 26027 + }, + { + "epoch": 0.26028, + "grad_norm": 1.2365332437837289, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 26028 + }, + { + "epoch": 0.26029, + "grad_norm": 0.8474961503811502, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 26029 + }, + { + "epoch": 0.2603, + "grad_norm": 0.8377934579253402, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 26030 + }, + { + "epoch": 0.26031, + "grad_norm": 0.9188469687326098, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 26031 + }, + { + "epoch": 0.26032, + "grad_norm": 0.9687397684167955, + "learning_rate": 0.003, + "loss": 4.073, + "step": 26032 + }, + { + "epoch": 0.26033, + "grad_norm": 1.0865205811001855, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 26033 + }, + { + "epoch": 0.26034, + "grad_norm": 0.9555354968937448, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 26034 + }, + { + "epoch": 0.26035, + "grad_norm": 0.9097132353144967, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 26035 + }, + { + "epoch": 0.26036, + "grad_norm": 0.9316146616774199, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 26036 + }, + { + "epoch": 0.26037, + "grad_norm": 0.926009909966239, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 26037 + }, + { + "epoch": 0.26038, + "grad_norm": 0.9256877583114714, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 26038 + }, + { + "epoch": 0.26039, + "grad_norm": 0.9784970497027995, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 26039 + }, + { + "epoch": 0.2604, + "grad_norm": 1.0993479034786777, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 26040 + }, + { + "epoch": 0.26041, + "grad_norm": 0.985593785836954, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 26041 + }, + { + "epoch": 0.26042, + "grad_norm": 1.0330897354353592, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 26042 + }, + { + "epoch": 0.26043, + "grad_norm": 0.8675892979754002, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 26043 + }, + { + "epoch": 0.26044, + "grad_norm": 0.8615616200991423, + "learning_rate": 0.003, + "loss": 4.081, + "step": 26044 + }, + { + "epoch": 0.26045, + "grad_norm": 0.835683431740189, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 26045 + }, + { + "epoch": 0.26046, + "grad_norm": 1.036041550582916, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 26046 + }, + { + "epoch": 0.26047, + "grad_norm": 1.1675796906912497, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 26047 + }, + { + "epoch": 0.26048, + "grad_norm": 0.8528364773244653, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 26048 + }, + { + "epoch": 0.26049, + "grad_norm": 0.6994713501558297, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 26049 + }, + { + "epoch": 0.2605, + "grad_norm": 0.77891095079453, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 26050 + }, + { + "epoch": 0.26051, + "grad_norm": 0.7596221570654217, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 26051 + }, + { + "epoch": 0.26052, + "grad_norm": 0.8673844761868408, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 26052 + }, + { + "epoch": 0.26053, + "grad_norm": 0.962025463777746, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 26053 + }, + { + "epoch": 0.26054, + "grad_norm": 0.9701191897599432, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 26054 + }, + { + "epoch": 0.26055, + "grad_norm": 1.187393061210973, + "learning_rate": 0.003, + "loss": 4.072, + "step": 26055 + }, + { + "epoch": 0.26056, + "grad_norm": 1.003114948348615, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 26056 + }, + { + "epoch": 0.26057, + "grad_norm": 0.8812584057119388, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 26057 + }, + { + "epoch": 0.26058, + "grad_norm": 0.9409559852394075, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 26058 + }, + { + "epoch": 0.26059, + "grad_norm": 1.0709931655234577, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 26059 + }, + { + "epoch": 0.2606, + "grad_norm": 0.9630774659088162, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 26060 + }, + { + "epoch": 0.26061, + "grad_norm": 1.016432703663005, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 26061 + }, + { + "epoch": 0.26062, + "grad_norm": 1.0353506780361699, + "learning_rate": 0.003, + "loss": 4.022, + "step": 26062 + }, + { + "epoch": 0.26063, + "grad_norm": 0.8411446179884322, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 26063 + }, + { + "epoch": 0.26064, + "grad_norm": 0.7113642924166592, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 26064 + }, + { + "epoch": 0.26065, + "grad_norm": 0.5496002794049029, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 26065 + }, + { + "epoch": 0.26066, + "grad_norm": 0.5486693501352954, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 26066 + }, + { + "epoch": 0.26067, + "grad_norm": 0.5370418031320295, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 26067 + }, + { + "epoch": 0.26068, + "grad_norm": 0.46506180992826696, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 26068 + }, + { + "epoch": 0.26069, + "grad_norm": 0.4760793267323303, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 26069 + }, + { + "epoch": 0.2607, + "grad_norm": 0.5179597923720156, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 26070 + }, + { + "epoch": 0.26071, + "grad_norm": 0.5236738804807307, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 26071 + }, + { + "epoch": 0.26072, + "grad_norm": 0.5466667465663434, + "learning_rate": 0.003, + "loss": 3.9953, + "step": 26072 + }, + { + "epoch": 0.26073, + "grad_norm": 0.6466389647249953, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 26073 + }, + { + "epoch": 0.26074, + "grad_norm": 0.7534005417879434, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 26074 + }, + { + "epoch": 0.26075, + "grad_norm": 0.8601144925728131, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 26075 + }, + { + "epoch": 0.26076, + "grad_norm": 0.9531081274450106, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 26076 + }, + { + "epoch": 0.26077, + "grad_norm": 0.7739487192822412, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 26077 + }, + { + "epoch": 0.26078, + "grad_norm": 0.7751696289825718, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 26078 + }, + { + "epoch": 0.26079, + "grad_norm": 0.9032741244597159, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 26079 + }, + { + "epoch": 0.2608, + "grad_norm": 1.1918145156003386, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 26080 + }, + { + "epoch": 0.26081, + "grad_norm": 1.0750482025945935, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 26081 + }, + { + "epoch": 0.26082, + "grad_norm": 0.9504724947176573, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 26082 + }, + { + "epoch": 0.26083, + "grad_norm": 0.9287246561891422, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 26083 + }, + { + "epoch": 0.26084, + "grad_norm": 0.9045879816655447, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 26084 + }, + { + "epoch": 0.26085, + "grad_norm": 0.9018666372160579, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 26085 + }, + { + "epoch": 0.26086, + "grad_norm": 0.795614001086891, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 26086 + }, + { + "epoch": 0.26087, + "grad_norm": 0.8571378805568793, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 26087 + }, + { + "epoch": 0.26088, + "grad_norm": 0.9868071123234646, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 26088 + }, + { + "epoch": 0.26089, + "grad_norm": 1.0702366253554527, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 26089 + }, + { + "epoch": 0.2609, + "grad_norm": 0.8855751449590396, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 26090 + }, + { + "epoch": 0.26091, + "grad_norm": 0.8961938798809969, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 26091 + }, + { + "epoch": 0.26092, + "grad_norm": 0.8155684210360971, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 26092 + }, + { + "epoch": 0.26093, + "grad_norm": 0.8211536793960583, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 26093 + }, + { + "epoch": 0.26094, + "grad_norm": 0.7358550554289822, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 26094 + }, + { + "epoch": 0.26095, + "grad_norm": 0.6850162783000483, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 26095 + }, + { + "epoch": 0.26096, + "grad_norm": 0.8040050321004595, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 26096 + }, + { + "epoch": 0.26097, + "grad_norm": 1.0008675953964583, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 26097 + }, + { + "epoch": 0.26098, + "grad_norm": 1.2297217774394595, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 26098 + }, + { + "epoch": 0.26099, + "grad_norm": 0.9678521083940245, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 26099 + }, + { + "epoch": 0.261, + "grad_norm": 1.0434990907975106, + "learning_rate": 0.003, + "loss": 4.042, + "step": 26100 + }, + { + "epoch": 0.26101, + "grad_norm": 1.0267214274996164, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 26101 + }, + { + "epoch": 0.26102, + "grad_norm": 0.9678018786810181, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 26102 + }, + { + "epoch": 0.26103, + "grad_norm": 0.9302899086120066, + "learning_rate": 0.003, + "loss": 4.083, + "step": 26103 + }, + { + "epoch": 0.26104, + "grad_norm": 0.8949080051980559, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 26104 + }, + { + "epoch": 0.26105, + "grad_norm": 0.8318899747996377, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 26105 + }, + { + "epoch": 0.26106, + "grad_norm": 0.8818053063582686, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 26106 + }, + { + "epoch": 0.26107, + "grad_norm": 0.8848169930994063, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 26107 + }, + { + "epoch": 0.26108, + "grad_norm": 0.9860821520833363, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 26108 + }, + { + "epoch": 0.26109, + "grad_norm": 1.0504841548627208, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 26109 + }, + { + "epoch": 0.2611, + "grad_norm": 1.0465797551518605, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 26110 + }, + { + "epoch": 0.26111, + "grad_norm": 1.0489188083218413, + "learning_rate": 0.003, + "loss": 4.036, + "step": 26111 + }, + { + "epoch": 0.26112, + "grad_norm": 1.028654500705115, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 26112 + }, + { + "epoch": 0.26113, + "grad_norm": 1.122170536375792, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 26113 + }, + { + "epoch": 0.26114, + "grad_norm": 0.7577094639098627, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 26114 + }, + { + "epoch": 0.26115, + "grad_norm": 0.7327647971810204, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 26115 + }, + { + "epoch": 0.26116, + "grad_norm": 0.6981746510064534, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 26116 + }, + { + "epoch": 0.26117, + "grad_norm": 0.7053277219712358, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 26117 + }, + { + "epoch": 0.26118, + "grad_norm": 0.6520060481545832, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 26118 + }, + { + "epoch": 0.26119, + "grad_norm": 0.6403097855909565, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 26119 + }, + { + "epoch": 0.2612, + "grad_norm": 0.6293790481219621, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 26120 + }, + { + "epoch": 0.26121, + "grad_norm": 0.6739906758605874, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 26121 + }, + { + "epoch": 0.26122, + "grad_norm": 0.6769967545240873, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 26122 + }, + { + "epoch": 0.26123, + "grad_norm": 0.6271804544581302, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 26123 + }, + { + "epoch": 0.26124, + "grad_norm": 0.6248084816110033, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 26124 + }, + { + "epoch": 0.26125, + "grad_norm": 0.7015609503084236, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 26125 + }, + { + "epoch": 0.26126, + "grad_norm": 0.7173271461637023, + "learning_rate": 0.003, + "loss": 4.038, + "step": 26126 + }, + { + "epoch": 0.26127, + "grad_norm": 0.76880893259204, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 26127 + }, + { + "epoch": 0.26128, + "grad_norm": 0.8546971782805463, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 26128 + }, + { + "epoch": 0.26129, + "grad_norm": 0.9262169223679229, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 26129 + }, + { + "epoch": 0.2613, + "grad_norm": 1.209562202995274, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 26130 + }, + { + "epoch": 0.26131, + "grad_norm": 0.9837295238600589, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 26131 + }, + { + "epoch": 0.26132, + "grad_norm": 1.2164612596155162, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 26132 + }, + { + "epoch": 0.26133, + "grad_norm": 0.9052805546299899, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 26133 + }, + { + "epoch": 0.26134, + "grad_norm": 0.7958188646132287, + "learning_rate": 0.003, + "loss": 4.049, + "step": 26134 + }, + { + "epoch": 0.26135, + "grad_norm": 0.7228169526352054, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 26135 + }, + { + "epoch": 0.26136, + "grad_norm": 0.822663743890147, + "learning_rate": 0.003, + "loss": 4.044, + "step": 26136 + }, + { + "epoch": 0.26137, + "grad_norm": 0.8900026489173738, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 26137 + }, + { + "epoch": 0.26138, + "grad_norm": 0.9498592760010331, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 26138 + }, + { + "epoch": 0.26139, + "grad_norm": 1.0160936820117044, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 26139 + }, + { + "epoch": 0.2614, + "grad_norm": 1.0441574991131934, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 26140 + }, + { + "epoch": 0.26141, + "grad_norm": 1.0778389621924822, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 26141 + }, + { + "epoch": 0.26142, + "grad_norm": 0.9421381994374114, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 26142 + }, + { + "epoch": 0.26143, + "grad_norm": 0.8478442866010983, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 26143 + }, + { + "epoch": 0.26144, + "grad_norm": 0.9981132603959519, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 26144 + }, + { + "epoch": 0.26145, + "grad_norm": 1.4113166116982694, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 26145 + }, + { + "epoch": 0.26146, + "grad_norm": 0.7299746261314736, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 26146 + }, + { + "epoch": 0.26147, + "grad_norm": 0.7337085940973237, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 26147 + }, + { + "epoch": 0.26148, + "grad_norm": 0.7967423526101783, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 26148 + }, + { + "epoch": 0.26149, + "grad_norm": 0.9571080840385005, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 26149 + }, + { + "epoch": 0.2615, + "grad_norm": 1.2442136447377572, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 26150 + }, + { + "epoch": 0.26151, + "grad_norm": 0.7451181392919868, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 26151 + }, + { + "epoch": 0.26152, + "grad_norm": 0.7971577003875165, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 26152 + }, + { + "epoch": 0.26153, + "grad_norm": 0.9217824954397043, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 26153 + }, + { + "epoch": 0.26154, + "grad_norm": 1.0535173348144642, + "learning_rate": 0.003, + "loss": 4.056, + "step": 26154 + }, + { + "epoch": 0.26155, + "grad_norm": 0.9875766000485343, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 26155 + }, + { + "epoch": 0.26156, + "grad_norm": 0.870727216162887, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 26156 + }, + { + "epoch": 0.26157, + "grad_norm": 0.8974823005065079, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 26157 + }, + { + "epoch": 0.26158, + "grad_norm": 0.8342047578780025, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 26158 + }, + { + "epoch": 0.26159, + "grad_norm": 0.871464791418511, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 26159 + }, + { + "epoch": 0.2616, + "grad_norm": 0.9328411279255361, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 26160 + }, + { + "epoch": 0.26161, + "grad_norm": 0.9795277450175764, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 26161 + }, + { + "epoch": 0.26162, + "grad_norm": 1.0699961638310336, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 26162 + }, + { + "epoch": 0.26163, + "grad_norm": 1.1135678867890897, + "learning_rate": 0.003, + "loss": 4.049, + "step": 26163 + }, + { + "epoch": 0.26164, + "grad_norm": 0.9913040261567545, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 26164 + }, + { + "epoch": 0.26165, + "grad_norm": 1.0709051919317556, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 26165 + }, + { + "epoch": 0.26166, + "grad_norm": 0.9786837065347199, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 26166 + }, + { + "epoch": 0.26167, + "grad_norm": 0.7946176374456817, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 26167 + }, + { + "epoch": 0.26168, + "grad_norm": 0.7487414672438271, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 26168 + }, + { + "epoch": 0.26169, + "grad_norm": 0.622999121167594, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 26169 + }, + { + "epoch": 0.2617, + "grad_norm": 0.6523425293751464, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 26170 + }, + { + "epoch": 0.26171, + "grad_norm": 0.6278395896515819, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 26171 + }, + { + "epoch": 0.26172, + "grad_norm": 0.6654667915237673, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 26172 + }, + { + "epoch": 0.26173, + "grad_norm": 0.6911176692910286, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 26173 + }, + { + "epoch": 0.26174, + "grad_norm": 0.7927240238585901, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 26174 + }, + { + "epoch": 0.26175, + "grad_norm": 0.8665871166802704, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 26175 + }, + { + "epoch": 0.26176, + "grad_norm": 0.8554582668413623, + "learning_rate": 0.003, + "loss": 4.055, + "step": 26176 + }, + { + "epoch": 0.26177, + "grad_norm": 0.8548705452997132, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 26177 + }, + { + "epoch": 0.26178, + "grad_norm": 0.811262256766966, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 26178 + }, + { + "epoch": 0.26179, + "grad_norm": 0.8672019491786611, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 26179 + }, + { + "epoch": 0.2618, + "grad_norm": 1.0226958696872295, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 26180 + }, + { + "epoch": 0.26181, + "grad_norm": 1.1288692742829252, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 26181 + }, + { + "epoch": 0.26182, + "grad_norm": 0.8351683694338491, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 26182 + }, + { + "epoch": 0.26183, + "grad_norm": 0.8036886678071687, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 26183 + }, + { + "epoch": 0.26184, + "grad_norm": 0.7596885242566551, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 26184 + }, + { + "epoch": 0.26185, + "grad_norm": 0.7761230407309635, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 26185 + }, + { + "epoch": 0.26186, + "grad_norm": 0.8463799400896802, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 26186 + }, + { + "epoch": 0.26187, + "grad_norm": 0.8452550112625654, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 26187 + }, + { + "epoch": 0.26188, + "grad_norm": 0.8819258247251434, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 26188 + }, + { + "epoch": 0.26189, + "grad_norm": 1.0704953500861512, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 26189 + }, + { + "epoch": 0.2619, + "grad_norm": 1.2149785995897582, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 26190 + }, + { + "epoch": 0.26191, + "grad_norm": 0.7742188476812769, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 26191 + }, + { + "epoch": 0.26192, + "grad_norm": 0.7510838877588842, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 26192 + }, + { + "epoch": 0.26193, + "grad_norm": 0.8361943717285526, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 26193 + }, + { + "epoch": 0.26194, + "grad_norm": 0.9338079918428045, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 26194 + }, + { + "epoch": 0.26195, + "grad_norm": 0.8596427759634822, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 26195 + }, + { + "epoch": 0.26196, + "grad_norm": 0.9016987025684433, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 26196 + }, + { + "epoch": 0.26197, + "grad_norm": 0.967231069974892, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 26197 + }, + { + "epoch": 0.26198, + "grad_norm": 0.9288960270670875, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 26198 + }, + { + "epoch": 0.26199, + "grad_norm": 1.0255603408997926, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 26199 + }, + { + "epoch": 0.262, + "grad_norm": 1.0640304321704845, + "learning_rate": 0.003, + "loss": 4.081, + "step": 26200 + }, + { + "epoch": 0.26201, + "grad_norm": 0.8324910995753343, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 26201 + }, + { + "epoch": 0.26202, + "grad_norm": 0.893605807412069, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 26202 + }, + { + "epoch": 0.26203, + "grad_norm": 1.05870122480406, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 26203 + }, + { + "epoch": 0.26204, + "grad_norm": 0.9771788798866383, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 26204 + }, + { + "epoch": 0.26205, + "grad_norm": 0.8401702345381304, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 26205 + }, + { + "epoch": 0.26206, + "grad_norm": 0.710617038260591, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 26206 + }, + { + "epoch": 0.26207, + "grad_norm": 0.6238455966692761, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 26207 + }, + { + "epoch": 0.26208, + "grad_norm": 0.6600554828945892, + "learning_rate": 0.003, + "loss": 4.038, + "step": 26208 + }, + { + "epoch": 0.26209, + "grad_norm": 0.7162131750535364, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 26209 + }, + { + "epoch": 0.2621, + "grad_norm": 0.69481936448279, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 26210 + }, + { + "epoch": 0.26211, + "grad_norm": 0.6786574098826635, + "learning_rate": 0.003, + "loss": 3.9968, + "step": 26211 + }, + { + "epoch": 0.26212, + "grad_norm": 0.7168254288215564, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 26212 + }, + { + "epoch": 0.26213, + "grad_norm": 0.6548952978420925, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 26213 + }, + { + "epoch": 0.26214, + "grad_norm": 0.6887107235912903, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 26214 + }, + { + "epoch": 0.26215, + "grad_norm": 0.7664994610839558, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 26215 + }, + { + "epoch": 0.26216, + "grad_norm": 0.8945874303937961, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 26216 + }, + { + "epoch": 0.26217, + "grad_norm": 1.1132401905276665, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 26217 + }, + { + "epoch": 0.26218, + "grad_norm": 1.0629038939921511, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 26218 + }, + { + "epoch": 0.26219, + "grad_norm": 1.0615366999980875, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 26219 + }, + { + "epoch": 0.2622, + "grad_norm": 1.008818814954345, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 26220 + }, + { + "epoch": 0.26221, + "grad_norm": 0.9650317343948729, + "learning_rate": 0.003, + "loss": 4.063, + "step": 26221 + }, + { + "epoch": 0.26222, + "grad_norm": 1.1103801658025791, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 26222 + }, + { + "epoch": 0.26223, + "grad_norm": 1.0145793555295273, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 26223 + }, + { + "epoch": 0.26224, + "grad_norm": 0.9732576958500327, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 26224 + }, + { + "epoch": 0.26225, + "grad_norm": 0.9401817363836049, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 26225 + }, + { + "epoch": 0.26226, + "grad_norm": 0.8462411326042419, + "learning_rate": 0.003, + "loss": 4.072, + "step": 26226 + }, + { + "epoch": 0.26227, + "grad_norm": 0.8273540810131589, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 26227 + }, + { + "epoch": 0.26228, + "grad_norm": 0.7856260091465705, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 26228 + }, + { + "epoch": 0.26229, + "grad_norm": 0.8135587875147215, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 26229 + }, + { + "epoch": 0.2623, + "grad_norm": 1.0124533895174466, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 26230 + }, + { + "epoch": 0.26231, + "grad_norm": 1.3424260946475752, + "learning_rate": 0.003, + "loss": 4.081, + "step": 26231 + }, + { + "epoch": 0.26232, + "grad_norm": 0.8872451637430662, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 26232 + }, + { + "epoch": 0.26233, + "grad_norm": 0.7754398705892791, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 26233 + }, + { + "epoch": 0.26234, + "grad_norm": 0.6529116539139486, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 26234 + }, + { + "epoch": 0.26235, + "grad_norm": 0.6422858716029861, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 26235 + }, + { + "epoch": 0.26236, + "grad_norm": 0.6175926059224309, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 26236 + }, + { + "epoch": 0.26237, + "grad_norm": 0.6940176293482264, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 26237 + }, + { + "epoch": 0.26238, + "grad_norm": 0.7484724898267537, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 26238 + }, + { + "epoch": 0.26239, + "grad_norm": 0.7748681911845664, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 26239 + }, + { + "epoch": 0.2624, + "grad_norm": 0.8185752623992257, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 26240 + }, + { + "epoch": 0.26241, + "grad_norm": 0.7735752237594188, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 26241 + }, + { + "epoch": 0.26242, + "grad_norm": 0.7142137024420664, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 26242 + }, + { + "epoch": 0.26243, + "grad_norm": 0.6691683193115119, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 26243 + }, + { + "epoch": 0.26244, + "grad_norm": 0.7120853425076942, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 26244 + }, + { + "epoch": 0.26245, + "grad_norm": 0.7004128980488831, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 26245 + }, + { + "epoch": 0.26246, + "grad_norm": 0.7831553544282576, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 26246 + }, + { + "epoch": 0.26247, + "grad_norm": 1.1322478964914258, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 26247 + }, + { + "epoch": 0.26248, + "grad_norm": 1.1843122835277173, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 26248 + }, + { + "epoch": 0.26249, + "grad_norm": 0.8871184361582932, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 26249 + }, + { + "epoch": 0.2625, + "grad_norm": 0.8031743251717228, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 26250 + }, + { + "epoch": 0.26251, + "grad_norm": 0.7044472498809049, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 26251 + }, + { + "epoch": 0.26252, + "grad_norm": 0.7487721964501101, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 26252 + }, + { + "epoch": 0.26253, + "grad_norm": 0.8621985479181946, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 26253 + }, + { + "epoch": 0.26254, + "grad_norm": 1.061993957823432, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 26254 + }, + { + "epoch": 0.26255, + "grad_norm": 1.1407790435576035, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 26255 + }, + { + "epoch": 0.26256, + "grad_norm": 0.8616823012900878, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 26256 + }, + { + "epoch": 0.26257, + "grad_norm": 0.9025329451937258, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 26257 + }, + { + "epoch": 0.26258, + "grad_norm": 1.0079900688881447, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 26258 + }, + { + "epoch": 0.26259, + "grad_norm": 1.1295140187489214, + "learning_rate": 0.003, + "loss": 4.04, + "step": 26259 + }, + { + "epoch": 0.2626, + "grad_norm": 0.790986819904, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 26260 + }, + { + "epoch": 0.26261, + "grad_norm": 0.7521691366434736, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 26261 + }, + { + "epoch": 0.26262, + "grad_norm": 0.7301074122856195, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 26262 + }, + { + "epoch": 0.26263, + "grad_norm": 0.7896891443906977, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 26263 + }, + { + "epoch": 0.26264, + "grad_norm": 0.9550362655461249, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 26264 + }, + { + "epoch": 0.26265, + "grad_norm": 1.197437017414975, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 26265 + }, + { + "epoch": 0.26266, + "grad_norm": 1.207926531338263, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 26266 + }, + { + "epoch": 0.26267, + "grad_norm": 0.9167232558741086, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 26267 + }, + { + "epoch": 0.26268, + "grad_norm": 0.9163525081509245, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 26268 + }, + { + "epoch": 0.26269, + "grad_norm": 1.0679378402947535, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 26269 + }, + { + "epoch": 0.2627, + "grad_norm": 0.9775004088351289, + "learning_rate": 0.003, + "loss": 4.041, + "step": 26270 + }, + { + "epoch": 0.26271, + "grad_norm": 0.8798112366115134, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 26271 + }, + { + "epoch": 0.26272, + "grad_norm": 0.8468671773409967, + "learning_rate": 0.003, + "loss": 4.048, + "step": 26272 + }, + { + "epoch": 0.26273, + "grad_norm": 0.8489561677812455, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 26273 + }, + { + "epoch": 0.26274, + "grad_norm": 0.8134026922098382, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 26274 + }, + { + "epoch": 0.26275, + "grad_norm": 0.920998239063929, + "learning_rate": 0.003, + "loss": 4.017, + "step": 26275 + }, + { + "epoch": 0.26276, + "grad_norm": 0.9647789386408926, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 26276 + }, + { + "epoch": 0.26277, + "grad_norm": 0.9093502748505667, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 26277 + }, + { + "epoch": 0.26278, + "grad_norm": 0.8114768874614045, + "learning_rate": 0.003, + "loss": 4.01, + "step": 26278 + }, + { + "epoch": 0.26279, + "grad_norm": 0.7694505761861851, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 26279 + }, + { + "epoch": 0.2628, + "grad_norm": 0.8628714682379884, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 26280 + }, + { + "epoch": 0.26281, + "grad_norm": 0.7811901650739501, + "learning_rate": 0.003, + "loss": 4.043, + "step": 26281 + }, + { + "epoch": 0.26282, + "grad_norm": 0.9238183684618442, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 26282 + }, + { + "epoch": 0.26283, + "grad_norm": 1.0915570172819549, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 26283 + }, + { + "epoch": 0.26284, + "grad_norm": 1.212763631289959, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 26284 + }, + { + "epoch": 0.26285, + "grad_norm": 0.74442577299926, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 26285 + }, + { + "epoch": 0.26286, + "grad_norm": 0.6725979852285053, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 26286 + }, + { + "epoch": 0.26287, + "grad_norm": 0.6577899737244505, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 26287 + }, + { + "epoch": 0.26288, + "grad_norm": 0.6457362773646927, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 26288 + }, + { + "epoch": 0.26289, + "grad_norm": 0.8286101723706436, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 26289 + }, + { + "epoch": 0.2629, + "grad_norm": 0.8679813901989959, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 26290 + }, + { + "epoch": 0.26291, + "grad_norm": 0.7165778630461782, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 26291 + }, + { + "epoch": 0.26292, + "grad_norm": 0.6984354100131633, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 26292 + }, + { + "epoch": 0.26293, + "grad_norm": 0.8256529659186522, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 26293 + }, + { + "epoch": 0.26294, + "grad_norm": 1.0901196600608516, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 26294 + }, + { + "epoch": 0.26295, + "grad_norm": 1.2113541274854491, + "learning_rate": 0.003, + "loss": 4.047, + "step": 26295 + }, + { + "epoch": 0.26296, + "grad_norm": 0.7920236954832262, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 26296 + }, + { + "epoch": 0.26297, + "grad_norm": 0.6582118160018114, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 26297 + }, + { + "epoch": 0.26298, + "grad_norm": 0.5970007620304251, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 26298 + }, + { + "epoch": 0.26299, + "grad_norm": 0.6278891863428021, + "learning_rate": 0.003, + "loss": 4.0064, + "step": 26299 + }, + { + "epoch": 0.263, + "grad_norm": 0.6279120283927472, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 26300 + }, + { + "epoch": 0.26301, + "grad_norm": 0.6091578889114159, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 26301 + }, + { + "epoch": 0.26302, + "grad_norm": 0.6728227960727188, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 26302 + }, + { + "epoch": 0.26303, + "grad_norm": 0.7225328872988693, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 26303 + }, + { + "epoch": 0.26304, + "grad_norm": 0.7150045251330915, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 26304 + }, + { + "epoch": 0.26305, + "grad_norm": 0.8965134883394871, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 26305 + }, + { + "epoch": 0.26306, + "grad_norm": 1.2208677908935968, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 26306 + }, + { + "epoch": 0.26307, + "grad_norm": 0.8682731274917596, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 26307 + }, + { + "epoch": 0.26308, + "grad_norm": 0.8922736464182072, + "learning_rate": 0.003, + "loss": 4.059, + "step": 26308 + }, + { + "epoch": 0.26309, + "grad_norm": 1.0760558985185473, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 26309 + }, + { + "epoch": 0.2631, + "grad_norm": 1.0676096075131747, + "learning_rate": 0.003, + "loss": 4.032, + "step": 26310 + }, + { + "epoch": 0.26311, + "grad_norm": 1.100043151002186, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 26311 + }, + { + "epoch": 0.26312, + "grad_norm": 1.0106332974486276, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 26312 + }, + { + "epoch": 0.26313, + "grad_norm": 1.0293597174838511, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 26313 + }, + { + "epoch": 0.26314, + "grad_norm": 0.9618948467981816, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 26314 + }, + { + "epoch": 0.26315, + "grad_norm": 1.0351998663337603, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 26315 + }, + { + "epoch": 0.26316, + "grad_norm": 0.9432089852548535, + "learning_rate": 0.003, + "loss": 4.061, + "step": 26316 + }, + { + "epoch": 0.26317, + "grad_norm": 0.9961526705061539, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 26317 + }, + { + "epoch": 0.26318, + "grad_norm": 1.1133722203640881, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 26318 + }, + { + "epoch": 0.26319, + "grad_norm": 0.7571019654191382, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 26319 + }, + { + "epoch": 0.2632, + "grad_norm": 0.768280436814758, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 26320 + }, + { + "epoch": 0.26321, + "grad_norm": 0.7590249305878454, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 26321 + }, + { + "epoch": 0.26322, + "grad_norm": 0.8143382745718001, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 26322 + }, + { + "epoch": 0.26323, + "grad_norm": 0.8721505023259459, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 26323 + }, + { + "epoch": 0.26324, + "grad_norm": 0.7406563134746335, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 26324 + }, + { + "epoch": 0.26325, + "grad_norm": 0.6945285002428451, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 26325 + }, + { + "epoch": 0.26326, + "grad_norm": 0.636047233918367, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 26326 + }, + { + "epoch": 0.26327, + "grad_norm": 0.654385451886071, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 26327 + }, + { + "epoch": 0.26328, + "grad_norm": 0.7383077330479317, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 26328 + }, + { + "epoch": 0.26329, + "grad_norm": 0.9451589602728051, + "learning_rate": 0.003, + "loss": 4.031, + "step": 26329 + }, + { + "epoch": 0.2633, + "grad_norm": 1.0790389165242444, + "learning_rate": 0.003, + "loss": 4.059, + "step": 26330 + }, + { + "epoch": 0.26331, + "grad_norm": 1.0465190465778913, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 26331 + }, + { + "epoch": 0.26332, + "grad_norm": 0.9378987592958155, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 26332 + }, + { + "epoch": 0.26333, + "grad_norm": 0.9786756403401775, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 26333 + }, + { + "epoch": 0.26334, + "grad_norm": 1.0588949194447288, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 26334 + }, + { + "epoch": 0.26335, + "grad_norm": 0.8342044133140121, + "learning_rate": 0.003, + "loss": 4.058, + "step": 26335 + }, + { + "epoch": 0.26336, + "grad_norm": 0.8507909769948382, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 26336 + }, + { + "epoch": 0.26337, + "grad_norm": 0.8209158519816615, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 26337 + }, + { + "epoch": 0.26338, + "grad_norm": 0.770291250093039, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 26338 + }, + { + "epoch": 0.26339, + "grad_norm": 0.9089123283005075, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 26339 + }, + { + "epoch": 0.2634, + "grad_norm": 1.1997642238070532, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 26340 + }, + { + "epoch": 0.26341, + "grad_norm": 0.9267601696290195, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 26341 + }, + { + "epoch": 0.26342, + "grad_norm": 0.9594968756001173, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 26342 + }, + { + "epoch": 0.26343, + "grad_norm": 1.0390175574294294, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 26343 + }, + { + "epoch": 0.26344, + "grad_norm": 1.1079896743378188, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 26344 + }, + { + "epoch": 0.26345, + "grad_norm": 0.8730707308834204, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 26345 + }, + { + "epoch": 0.26346, + "grad_norm": 0.8313016534289472, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 26346 + }, + { + "epoch": 0.26347, + "grad_norm": 0.8231652965647966, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 26347 + }, + { + "epoch": 0.26348, + "grad_norm": 0.8591856289309846, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 26348 + }, + { + "epoch": 0.26349, + "grad_norm": 0.8266083093254825, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 26349 + }, + { + "epoch": 0.2635, + "grad_norm": 0.9466743690274186, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 26350 + }, + { + "epoch": 0.26351, + "grad_norm": 1.0046860483149092, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 26351 + }, + { + "epoch": 0.26352, + "grad_norm": 1.1949230200591339, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 26352 + }, + { + "epoch": 0.26353, + "grad_norm": 0.8390139849462757, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 26353 + }, + { + "epoch": 0.26354, + "grad_norm": 0.7202718760362946, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 26354 + }, + { + "epoch": 0.26355, + "grad_norm": 0.8649998998430727, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 26355 + }, + { + "epoch": 0.26356, + "grad_norm": 0.8705055911584321, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 26356 + }, + { + "epoch": 0.26357, + "grad_norm": 0.8534090092226855, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 26357 + }, + { + "epoch": 0.26358, + "grad_norm": 0.9313325043444313, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 26358 + }, + { + "epoch": 0.26359, + "grad_norm": 1.1250027959631315, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 26359 + }, + { + "epoch": 0.2636, + "grad_norm": 1.0470279151080315, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 26360 + }, + { + "epoch": 0.26361, + "grad_norm": 1.02541984005741, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 26361 + }, + { + "epoch": 0.26362, + "grad_norm": 0.9602641336378365, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 26362 + }, + { + "epoch": 0.26363, + "grad_norm": 0.9364129022590112, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 26363 + }, + { + "epoch": 0.26364, + "grad_norm": 0.9307252377674574, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 26364 + }, + { + "epoch": 0.26365, + "grad_norm": 0.920306162620304, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 26365 + }, + { + "epoch": 0.26366, + "grad_norm": 0.9154178405861482, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 26366 + }, + { + "epoch": 0.26367, + "grad_norm": 0.937245208004888, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 26367 + }, + { + "epoch": 0.26368, + "grad_norm": 0.8398934953416446, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 26368 + }, + { + "epoch": 0.26369, + "grad_norm": 0.8173147407236, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 26369 + }, + { + "epoch": 0.2637, + "grad_norm": 0.7058333892613555, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 26370 + }, + { + "epoch": 0.26371, + "grad_norm": 0.6329275884277615, + "learning_rate": 0.003, + "loss": 4.037, + "step": 26371 + }, + { + "epoch": 0.26372, + "grad_norm": 0.728975976734911, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 26372 + }, + { + "epoch": 0.26373, + "grad_norm": 0.815344001364777, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 26373 + }, + { + "epoch": 0.26374, + "grad_norm": 0.9109008919062535, + "learning_rate": 0.003, + "loss": 4.0018, + "step": 26374 + }, + { + "epoch": 0.26375, + "grad_norm": 0.8803127956664503, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 26375 + }, + { + "epoch": 0.26376, + "grad_norm": 0.9407807448634652, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 26376 + }, + { + "epoch": 0.26377, + "grad_norm": 1.0657081824778805, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 26377 + }, + { + "epoch": 0.26378, + "grad_norm": 1.0181504109402226, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 26378 + }, + { + "epoch": 0.26379, + "grad_norm": 0.9214829771083111, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 26379 + }, + { + "epoch": 0.2638, + "grad_norm": 0.9342647661995772, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 26380 + }, + { + "epoch": 0.26381, + "grad_norm": 1.0947123414395898, + "learning_rate": 0.003, + "loss": 4.048, + "step": 26381 + }, + { + "epoch": 0.26382, + "grad_norm": 1.0246836464745697, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 26382 + }, + { + "epoch": 0.26383, + "grad_norm": 0.9703983020976162, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 26383 + }, + { + "epoch": 0.26384, + "grad_norm": 1.0284449878057695, + "learning_rate": 0.003, + "loss": 4.063, + "step": 26384 + }, + { + "epoch": 0.26385, + "grad_norm": 0.8717133495388695, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 26385 + }, + { + "epoch": 0.26386, + "grad_norm": 0.6963126463025701, + "learning_rate": 0.003, + "loss": 4.077, + "step": 26386 + }, + { + "epoch": 0.26387, + "grad_norm": 0.7560600662986483, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 26387 + }, + { + "epoch": 0.26388, + "grad_norm": 0.7112745384693496, + "learning_rate": 0.003, + "loss": 4.023, + "step": 26388 + }, + { + "epoch": 0.26389, + "grad_norm": 0.766828323395047, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 26389 + }, + { + "epoch": 0.2639, + "grad_norm": 0.6386782695691284, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 26390 + }, + { + "epoch": 0.26391, + "grad_norm": 0.6710948808489234, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 26391 + }, + { + "epoch": 0.26392, + "grad_norm": 0.7956219639057001, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 26392 + }, + { + "epoch": 0.26393, + "grad_norm": 0.8575451057485323, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 26393 + }, + { + "epoch": 0.26394, + "grad_norm": 0.9106084091652044, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 26394 + }, + { + "epoch": 0.26395, + "grad_norm": 1.067579032975518, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 26395 + }, + { + "epoch": 0.26396, + "grad_norm": 1.162232723376386, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 26396 + }, + { + "epoch": 0.26397, + "grad_norm": 0.9517569294306354, + "learning_rate": 0.003, + "loss": 4.052, + "step": 26397 + }, + { + "epoch": 0.26398, + "grad_norm": 0.8701270956448659, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 26398 + }, + { + "epoch": 0.26399, + "grad_norm": 0.7005957270105875, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 26399 + }, + { + "epoch": 0.264, + "grad_norm": 0.7143917165320282, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 26400 + }, + { + "epoch": 0.26401, + "grad_norm": 0.8254167017687327, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 26401 + }, + { + "epoch": 0.26402, + "grad_norm": 0.8650248959337753, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 26402 + }, + { + "epoch": 0.26403, + "grad_norm": 0.8191146834273513, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 26403 + }, + { + "epoch": 0.26404, + "grad_norm": 0.7578882041876643, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 26404 + }, + { + "epoch": 0.26405, + "grad_norm": 0.6705536445210004, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 26405 + }, + { + "epoch": 0.26406, + "grad_norm": 0.5864020268483535, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 26406 + }, + { + "epoch": 0.26407, + "grad_norm": 0.6249825674102845, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 26407 + }, + { + "epoch": 0.26408, + "grad_norm": 0.6175129399805311, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 26408 + }, + { + "epoch": 0.26409, + "grad_norm": 0.6476694851788265, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 26409 + }, + { + "epoch": 0.2641, + "grad_norm": 0.7625158024795464, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 26410 + }, + { + "epoch": 0.26411, + "grad_norm": 0.860459364848734, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 26411 + }, + { + "epoch": 0.26412, + "grad_norm": 1.0812651847513521, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 26412 + }, + { + "epoch": 0.26413, + "grad_norm": 1.0489418788409184, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 26413 + }, + { + "epoch": 0.26414, + "grad_norm": 0.7989897817720344, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 26414 + }, + { + "epoch": 0.26415, + "grad_norm": 0.8408677030070899, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 26415 + }, + { + "epoch": 0.26416, + "grad_norm": 0.9688824286156928, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 26416 + }, + { + "epoch": 0.26417, + "grad_norm": 0.8830501400318521, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 26417 + }, + { + "epoch": 0.26418, + "grad_norm": 0.8254920285957976, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 26418 + }, + { + "epoch": 0.26419, + "grad_norm": 0.7508440341993747, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 26419 + }, + { + "epoch": 0.2642, + "grad_norm": 0.7669802797847566, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 26420 + }, + { + "epoch": 0.26421, + "grad_norm": 0.8896881044472471, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 26421 + }, + { + "epoch": 0.26422, + "grad_norm": 1.1927831258741326, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 26422 + }, + { + "epoch": 0.26423, + "grad_norm": 0.9048049104330993, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 26423 + }, + { + "epoch": 0.26424, + "grad_norm": 0.982882329856891, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 26424 + }, + { + "epoch": 0.26425, + "grad_norm": 1.135873243157521, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 26425 + }, + { + "epoch": 0.26426, + "grad_norm": 1.1127915458170283, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 26426 + }, + { + "epoch": 0.26427, + "grad_norm": 1.0516026474370734, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 26427 + }, + { + "epoch": 0.26428, + "grad_norm": 1.130422845744541, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 26428 + }, + { + "epoch": 0.26429, + "grad_norm": 0.9970937210173137, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 26429 + }, + { + "epoch": 0.2643, + "grad_norm": 0.9862627116646423, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 26430 + }, + { + "epoch": 0.26431, + "grad_norm": 1.0545008421663888, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 26431 + }, + { + "epoch": 0.26432, + "grad_norm": 1.0297866480393942, + "learning_rate": 0.003, + "loss": 4.066, + "step": 26432 + }, + { + "epoch": 0.26433, + "grad_norm": 0.9480779968013529, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 26433 + }, + { + "epoch": 0.26434, + "grad_norm": 0.831788867926559, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 26434 + }, + { + "epoch": 0.26435, + "grad_norm": 0.6840378094823714, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 26435 + }, + { + "epoch": 0.26436, + "grad_norm": 0.6891648057766006, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 26436 + }, + { + "epoch": 0.26437, + "grad_norm": 0.8463507408971332, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 26437 + }, + { + "epoch": 0.26438, + "grad_norm": 1.075623620930203, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 26438 + }, + { + "epoch": 0.26439, + "grad_norm": 1.0543542680845774, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 26439 + }, + { + "epoch": 0.2644, + "grad_norm": 0.8031729208632075, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 26440 + }, + { + "epoch": 0.26441, + "grad_norm": 0.6550500802019965, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 26441 + }, + { + "epoch": 0.26442, + "grad_norm": 0.7573350753134994, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 26442 + }, + { + "epoch": 0.26443, + "grad_norm": 0.8889633407157393, + "learning_rate": 0.003, + "loss": 4.057, + "step": 26443 + }, + { + "epoch": 0.26444, + "grad_norm": 0.8955200498594348, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 26444 + }, + { + "epoch": 0.26445, + "grad_norm": 0.9411621158959186, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 26445 + }, + { + "epoch": 0.26446, + "grad_norm": 0.9985013275378802, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 26446 + }, + { + "epoch": 0.26447, + "grad_norm": 0.9707548495962474, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 26447 + }, + { + "epoch": 0.26448, + "grad_norm": 0.9277198050581533, + "learning_rate": 0.003, + "loss": 4.037, + "step": 26448 + }, + { + "epoch": 0.26449, + "grad_norm": 0.9758363009872599, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 26449 + }, + { + "epoch": 0.2645, + "grad_norm": 1.0001416446769837, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 26450 + }, + { + "epoch": 0.26451, + "grad_norm": 0.9800255214045345, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 26451 + }, + { + "epoch": 0.26452, + "grad_norm": 0.8565490459972365, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 26452 + }, + { + "epoch": 0.26453, + "grad_norm": 0.9113251032337254, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 26453 + }, + { + "epoch": 0.26454, + "grad_norm": 0.8389117178014288, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 26454 + }, + { + "epoch": 0.26455, + "grad_norm": 0.8331910636525025, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 26455 + }, + { + "epoch": 0.26456, + "grad_norm": 0.988341975817765, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 26456 + }, + { + "epoch": 0.26457, + "grad_norm": 1.0997198014297673, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 26457 + }, + { + "epoch": 0.26458, + "grad_norm": 0.8729624608622639, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 26458 + }, + { + "epoch": 0.26459, + "grad_norm": 0.7501123040205954, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 26459 + }, + { + "epoch": 0.2646, + "grad_norm": 0.7474377281299847, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 26460 + }, + { + "epoch": 0.26461, + "grad_norm": 0.7592649104106839, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 26461 + }, + { + "epoch": 0.26462, + "grad_norm": 0.6984067964799132, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 26462 + }, + { + "epoch": 0.26463, + "grad_norm": 0.7209472962451595, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 26463 + }, + { + "epoch": 0.26464, + "grad_norm": 0.7756276331832247, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 26464 + }, + { + "epoch": 0.26465, + "grad_norm": 0.8138205000370119, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 26465 + }, + { + "epoch": 0.26466, + "grad_norm": 0.9047164533691512, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 26466 + }, + { + "epoch": 0.26467, + "grad_norm": 0.9716370291731774, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 26467 + }, + { + "epoch": 0.26468, + "grad_norm": 0.9335191023201734, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 26468 + }, + { + "epoch": 0.26469, + "grad_norm": 0.8486803427402103, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 26469 + }, + { + "epoch": 0.2647, + "grad_norm": 0.8864848257748309, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 26470 + }, + { + "epoch": 0.26471, + "grad_norm": 0.7726791846119326, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 26471 + }, + { + "epoch": 0.26472, + "grad_norm": 0.7895070265189678, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 26472 + }, + { + "epoch": 0.26473, + "grad_norm": 0.8065934503354619, + "learning_rate": 0.003, + "loss": 4.016, + "step": 26473 + }, + { + "epoch": 0.26474, + "grad_norm": 0.8564721178785649, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 26474 + }, + { + "epoch": 0.26475, + "grad_norm": 1.1012056109420556, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 26475 + }, + { + "epoch": 0.26476, + "grad_norm": 0.9574911297667191, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 26476 + }, + { + "epoch": 0.26477, + "grad_norm": 0.784224530597972, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 26477 + }, + { + "epoch": 0.26478, + "grad_norm": 0.7366702338549552, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 26478 + }, + { + "epoch": 0.26479, + "grad_norm": 0.7600814438220768, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 26479 + }, + { + "epoch": 0.2648, + "grad_norm": 0.6873411325365798, + "learning_rate": 0.003, + "loss": 4.013, + "step": 26480 + }, + { + "epoch": 0.26481, + "grad_norm": 0.8400139834751884, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 26481 + }, + { + "epoch": 0.26482, + "grad_norm": 0.9999519491565146, + "learning_rate": 0.003, + "loss": 4.048, + "step": 26482 + }, + { + "epoch": 0.26483, + "grad_norm": 1.1854447759389781, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 26483 + }, + { + "epoch": 0.26484, + "grad_norm": 1.0205160308075745, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 26484 + }, + { + "epoch": 0.26485, + "grad_norm": 1.1415264366021833, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 26485 + }, + { + "epoch": 0.26486, + "grad_norm": 0.8793647690282758, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 26486 + }, + { + "epoch": 0.26487, + "grad_norm": 0.8017486036852204, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 26487 + }, + { + "epoch": 0.26488, + "grad_norm": 0.8008961164999143, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 26488 + }, + { + "epoch": 0.26489, + "grad_norm": 0.6818561500421075, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 26489 + }, + { + "epoch": 0.2649, + "grad_norm": 0.6749483346555479, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 26490 + }, + { + "epoch": 0.26491, + "grad_norm": 0.7011862927103796, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 26491 + }, + { + "epoch": 0.26492, + "grad_norm": 0.6877318658292878, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 26492 + }, + { + "epoch": 0.26493, + "grad_norm": 0.6786471718017335, + "learning_rate": 0.003, + "loss": 4.041, + "step": 26493 + }, + { + "epoch": 0.26494, + "grad_norm": 0.735619168290087, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 26494 + }, + { + "epoch": 0.26495, + "grad_norm": 0.9610149087782505, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 26495 + }, + { + "epoch": 0.26496, + "grad_norm": 1.056513198255653, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 26496 + }, + { + "epoch": 0.26497, + "grad_norm": 0.9183423889386664, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 26497 + }, + { + "epoch": 0.26498, + "grad_norm": 0.9464937449545988, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 26498 + }, + { + "epoch": 0.26499, + "grad_norm": 0.9812872537939151, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 26499 + }, + { + "epoch": 0.265, + "grad_norm": 1.0155022739525765, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 26500 + }, + { + "epoch": 0.26501, + "grad_norm": 0.9136252388732765, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 26501 + }, + { + "epoch": 0.26502, + "grad_norm": 0.8765427392869011, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 26502 + }, + { + "epoch": 0.26503, + "grad_norm": 0.8466278537553633, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 26503 + }, + { + "epoch": 0.26504, + "grad_norm": 0.7211701668090768, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 26504 + }, + { + "epoch": 0.26505, + "grad_norm": 0.709792348323559, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 26505 + }, + { + "epoch": 0.26506, + "grad_norm": 0.7034961963687268, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 26506 + }, + { + "epoch": 0.26507, + "grad_norm": 0.6790286507036759, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 26507 + }, + { + "epoch": 0.26508, + "grad_norm": 0.700781624201002, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 26508 + }, + { + "epoch": 0.26509, + "grad_norm": 0.7503016717391072, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 26509 + }, + { + "epoch": 0.2651, + "grad_norm": 0.784468647487236, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 26510 + }, + { + "epoch": 0.26511, + "grad_norm": 0.7751826820267655, + "learning_rate": 0.003, + "loss": 4.0025, + "step": 26511 + }, + { + "epoch": 0.26512, + "grad_norm": 0.8780569885062741, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 26512 + }, + { + "epoch": 0.26513, + "grad_norm": 1.085374624179859, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 26513 + }, + { + "epoch": 0.26514, + "grad_norm": 1.1160428927385875, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 26514 + }, + { + "epoch": 0.26515, + "grad_norm": 0.9932386606909059, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 26515 + }, + { + "epoch": 0.26516, + "grad_norm": 1.0655882167144841, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 26516 + }, + { + "epoch": 0.26517, + "grad_norm": 0.9838914878374098, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 26517 + }, + { + "epoch": 0.26518, + "grad_norm": 0.8094547759809417, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 26518 + }, + { + "epoch": 0.26519, + "grad_norm": 0.8420611730861844, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 26519 + }, + { + "epoch": 0.2652, + "grad_norm": 0.8634655364182763, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 26520 + }, + { + "epoch": 0.26521, + "grad_norm": 0.7959372676856116, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 26521 + }, + { + "epoch": 0.26522, + "grad_norm": 0.8248503569057528, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 26522 + }, + { + "epoch": 0.26523, + "grad_norm": 0.8965138341376468, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 26523 + }, + { + "epoch": 0.26524, + "grad_norm": 0.9519387129918376, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 26524 + }, + { + "epoch": 0.26525, + "grad_norm": 1.0522508229583183, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 26525 + }, + { + "epoch": 0.26526, + "grad_norm": 1.0568340427509701, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 26526 + }, + { + "epoch": 0.26527, + "grad_norm": 0.7951132429805443, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 26527 + }, + { + "epoch": 0.26528, + "grad_norm": 0.6629097337672287, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 26528 + }, + { + "epoch": 0.26529, + "grad_norm": 0.693422785260737, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 26529 + }, + { + "epoch": 0.2653, + "grad_norm": 0.8102985054367646, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 26530 + }, + { + "epoch": 0.26531, + "grad_norm": 1.0457243342227878, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 26531 + }, + { + "epoch": 0.26532, + "grad_norm": 1.1034054100830928, + "learning_rate": 0.003, + "loss": 4.035, + "step": 26532 + }, + { + "epoch": 0.26533, + "grad_norm": 0.8492093041777092, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 26533 + }, + { + "epoch": 0.26534, + "grad_norm": 0.7864015775622379, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 26534 + }, + { + "epoch": 0.26535, + "grad_norm": 0.8570386894433144, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 26535 + }, + { + "epoch": 0.26536, + "grad_norm": 0.8470269466946738, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 26536 + }, + { + "epoch": 0.26537, + "grad_norm": 0.8912428953016255, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 26537 + }, + { + "epoch": 0.26538, + "grad_norm": 1.0054688366721016, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 26538 + }, + { + "epoch": 0.26539, + "grad_norm": 0.9994619654800855, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 26539 + }, + { + "epoch": 0.2654, + "grad_norm": 1.0291891437504905, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 26540 + }, + { + "epoch": 0.26541, + "grad_norm": 0.9449779482929114, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 26541 + }, + { + "epoch": 0.26542, + "grad_norm": 0.8208802094563792, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 26542 + }, + { + "epoch": 0.26543, + "grad_norm": 0.7685638652637271, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 26543 + }, + { + "epoch": 0.26544, + "grad_norm": 0.8094918722457962, + "learning_rate": 0.003, + "loss": 4.083, + "step": 26544 + }, + { + "epoch": 0.26545, + "grad_norm": 0.7840593683450747, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 26545 + }, + { + "epoch": 0.26546, + "grad_norm": 0.9234567777721385, + "learning_rate": 0.003, + "loss": 4.045, + "step": 26546 + }, + { + "epoch": 0.26547, + "grad_norm": 0.9856434068206794, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 26547 + }, + { + "epoch": 0.26548, + "grad_norm": 1.135634091543436, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 26548 + }, + { + "epoch": 0.26549, + "grad_norm": 1.0180052598127605, + "learning_rate": 0.003, + "loss": 4.033, + "step": 26549 + }, + { + "epoch": 0.2655, + "grad_norm": 0.9917420436301264, + "learning_rate": 0.003, + "loss": 4.044, + "step": 26550 + }, + { + "epoch": 0.26551, + "grad_norm": 0.9076913636791657, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 26551 + }, + { + "epoch": 0.26552, + "grad_norm": 0.9014803711508053, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 26552 + }, + { + "epoch": 0.26553, + "grad_norm": 1.0031744020518278, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 26553 + }, + { + "epoch": 0.26554, + "grad_norm": 1.0310122879897317, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 26554 + }, + { + "epoch": 0.26555, + "grad_norm": 0.8630879336333662, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 26555 + }, + { + "epoch": 0.26556, + "grad_norm": 0.8482757208657407, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 26556 + }, + { + "epoch": 0.26557, + "grad_norm": 0.7921461539860246, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 26557 + }, + { + "epoch": 0.26558, + "grad_norm": 0.7062404825350208, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 26558 + }, + { + "epoch": 0.26559, + "grad_norm": 0.6434779495896984, + "learning_rate": 0.003, + "loss": 4.0101, + "step": 26559 + }, + { + "epoch": 0.2656, + "grad_norm": 0.6294746359856249, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 26560 + }, + { + "epoch": 0.26561, + "grad_norm": 0.6249647283990358, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 26561 + }, + { + "epoch": 0.26562, + "grad_norm": 0.6582266833177596, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 26562 + }, + { + "epoch": 0.26563, + "grad_norm": 0.8005843260633685, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 26563 + }, + { + "epoch": 0.26564, + "grad_norm": 1.0339601551323987, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 26564 + }, + { + "epoch": 0.26565, + "grad_norm": 1.1444726807610428, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 26565 + }, + { + "epoch": 0.26566, + "grad_norm": 0.7976268976031328, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 26566 + }, + { + "epoch": 0.26567, + "grad_norm": 0.7478091974341207, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 26567 + }, + { + "epoch": 0.26568, + "grad_norm": 0.7015358804726706, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 26568 + }, + { + "epoch": 0.26569, + "grad_norm": 0.6686937589930685, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 26569 + }, + { + "epoch": 0.2657, + "grad_norm": 0.6293193000193683, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 26570 + }, + { + "epoch": 0.26571, + "grad_norm": 0.5893256276572801, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 26571 + }, + { + "epoch": 0.26572, + "grad_norm": 0.6267665797984254, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 26572 + }, + { + "epoch": 0.26573, + "grad_norm": 0.7180731884210094, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 26573 + }, + { + "epoch": 0.26574, + "grad_norm": 0.7918770362948485, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 26574 + }, + { + "epoch": 0.26575, + "grad_norm": 0.8707971390235899, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 26575 + }, + { + "epoch": 0.26576, + "grad_norm": 1.0742671391332812, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 26576 + }, + { + "epoch": 0.26577, + "grad_norm": 1.059372407254524, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 26577 + }, + { + "epoch": 0.26578, + "grad_norm": 1.1173540771526385, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 26578 + }, + { + "epoch": 0.26579, + "grad_norm": 0.9334784311256933, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 26579 + }, + { + "epoch": 0.2658, + "grad_norm": 0.912483535758802, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 26580 + }, + { + "epoch": 0.26581, + "grad_norm": 0.9360021821549197, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 26581 + }, + { + "epoch": 0.26582, + "grad_norm": 0.917177744568296, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 26582 + }, + { + "epoch": 0.26583, + "grad_norm": 0.901200366271798, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 26583 + }, + { + "epoch": 0.26584, + "grad_norm": 0.9932961224541949, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 26584 + }, + { + "epoch": 0.26585, + "grad_norm": 0.7550484216826971, + "learning_rate": 0.003, + "loss": 4.059, + "step": 26585 + }, + { + "epoch": 0.26586, + "grad_norm": 0.6904213183744595, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 26586 + }, + { + "epoch": 0.26587, + "grad_norm": 0.7900128851326818, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 26587 + }, + { + "epoch": 0.26588, + "grad_norm": 0.8212886738583991, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 26588 + }, + { + "epoch": 0.26589, + "grad_norm": 0.8057590103896375, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 26589 + }, + { + "epoch": 0.2659, + "grad_norm": 0.8719420126745442, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 26590 + }, + { + "epoch": 0.26591, + "grad_norm": 1.0644395475080315, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 26591 + }, + { + "epoch": 0.26592, + "grad_norm": 1.0969912106384265, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 26592 + }, + { + "epoch": 0.26593, + "grad_norm": 1.0004021089407946, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 26593 + }, + { + "epoch": 0.26594, + "grad_norm": 1.1891625980656515, + "learning_rate": 0.003, + "loss": 4.046, + "step": 26594 + }, + { + "epoch": 0.26595, + "grad_norm": 1.0830354449787332, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 26595 + }, + { + "epoch": 0.26596, + "grad_norm": 0.8559300310113501, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 26596 + }, + { + "epoch": 0.26597, + "grad_norm": 0.7465984818710449, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 26597 + }, + { + "epoch": 0.26598, + "grad_norm": 0.6944511962179045, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 26598 + }, + { + "epoch": 0.26599, + "grad_norm": 0.6569263790868736, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 26599 + }, + { + "epoch": 0.266, + "grad_norm": 0.6727183475044749, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 26600 + }, + { + "epoch": 0.26601, + "grad_norm": 0.7363523814241277, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 26601 + }, + { + "epoch": 0.26602, + "grad_norm": 0.8898304489470836, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 26602 + }, + { + "epoch": 0.26603, + "grad_norm": 1.1792337032717648, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 26603 + }, + { + "epoch": 0.26604, + "grad_norm": 0.9382278959328094, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 26604 + }, + { + "epoch": 0.26605, + "grad_norm": 0.7412230866211287, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 26605 + }, + { + "epoch": 0.26606, + "grad_norm": 0.7061273718104436, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 26606 + }, + { + "epoch": 0.26607, + "grad_norm": 0.6419652278942862, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 26607 + }, + { + "epoch": 0.26608, + "grad_norm": 0.624108405526765, + "learning_rate": 0.003, + "loss": 4.087, + "step": 26608 + }, + { + "epoch": 0.26609, + "grad_norm": 0.7260998774620782, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 26609 + }, + { + "epoch": 0.2661, + "grad_norm": 0.8414296134654446, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 26610 + }, + { + "epoch": 0.26611, + "grad_norm": 1.0252421096378903, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 26611 + }, + { + "epoch": 0.26612, + "grad_norm": 0.972697906522464, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 26612 + }, + { + "epoch": 0.26613, + "grad_norm": 0.963674305976449, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 26613 + }, + { + "epoch": 0.26614, + "grad_norm": 1.0442225451968776, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 26614 + }, + { + "epoch": 0.26615, + "grad_norm": 0.9154175965331495, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 26615 + }, + { + "epoch": 0.26616, + "grad_norm": 0.6497602554719503, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 26616 + }, + { + "epoch": 0.26617, + "grad_norm": 0.6361910625565259, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 26617 + }, + { + "epoch": 0.26618, + "grad_norm": 0.8205470989904912, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 26618 + }, + { + "epoch": 0.26619, + "grad_norm": 0.9543037316112705, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 26619 + }, + { + "epoch": 0.2662, + "grad_norm": 1.1498093701608683, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 26620 + }, + { + "epoch": 0.26621, + "grad_norm": 1.0314979822258623, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 26621 + }, + { + "epoch": 0.26622, + "grad_norm": 1.0414088383615803, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 26622 + }, + { + "epoch": 0.26623, + "grad_norm": 0.9187221942093965, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 26623 + }, + { + "epoch": 0.26624, + "grad_norm": 0.8826723217229712, + "learning_rate": 0.003, + "loss": 4.066, + "step": 26624 + }, + { + "epoch": 0.26625, + "grad_norm": 1.010207166660528, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 26625 + }, + { + "epoch": 0.26626, + "grad_norm": 0.9263115765655748, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 26626 + }, + { + "epoch": 0.26627, + "grad_norm": 0.8464879776659414, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 26627 + }, + { + "epoch": 0.26628, + "grad_norm": 0.8312012033056366, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 26628 + }, + { + "epoch": 0.26629, + "grad_norm": 0.8650003532080943, + "learning_rate": 0.003, + "loss": 4.05, + "step": 26629 + }, + { + "epoch": 0.2663, + "grad_norm": 0.9670055546956314, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 26630 + }, + { + "epoch": 0.26631, + "grad_norm": 1.2759445776401157, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 26631 + }, + { + "epoch": 0.26632, + "grad_norm": 1.0237304026362863, + "learning_rate": 0.003, + "loss": 4.1034, + "step": 26632 + }, + { + "epoch": 0.26633, + "grad_norm": 1.0414362802994714, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 26633 + }, + { + "epoch": 0.26634, + "grad_norm": 1.1134138074174076, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 26634 + }, + { + "epoch": 0.26635, + "grad_norm": 1.0725962625003227, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 26635 + }, + { + "epoch": 0.26636, + "grad_norm": 1.1260254195629908, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 26636 + }, + { + "epoch": 0.26637, + "grad_norm": 0.9208476223812743, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 26637 + }, + { + "epoch": 0.26638, + "grad_norm": 0.8533590723517842, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 26638 + }, + { + "epoch": 0.26639, + "grad_norm": 0.8689167892321508, + "learning_rate": 0.003, + "loss": 4.097, + "step": 26639 + }, + { + "epoch": 0.2664, + "grad_norm": 0.7972294977074371, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 26640 + }, + { + "epoch": 0.26641, + "grad_norm": 0.7861236885164111, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 26641 + }, + { + "epoch": 0.26642, + "grad_norm": 0.9182536278111603, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 26642 + }, + { + "epoch": 0.26643, + "grad_norm": 0.9651958892753376, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 26643 + }, + { + "epoch": 0.26644, + "grad_norm": 0.9055922456649463, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 26644 + }, + { + "epoch": 0.26645, + "grad_norm": 0.7906809965198481, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 26645 + }, + { + "epoch": 0.26646, + "grad_norm": 0.7125052906612545, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 26646 + }, + { + "epoch": 0.26647, + "grad_norm": 0.7732075907172865, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 26647 + }, + { + "epoch": 0.26648, + "grad_norm": 0.8617790578171289, + "learning_rate": 0.003, + "loss": 4.073, + "step": 26648 + }, + { + "epoch": 0.26649, + "grad_norm": 0.8229307983839296, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 26649 + }, + { + "epoch": 0.2665, + "grad_norm": 0.802973432232185, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 26650 + }, + { + "epoch": 0.26651, + "grad_norm": 0.7832406968666211, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 26651 + }, + { + "epoch": 0.26652, + "grad_norm": 0.8143777204867714, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 26652 + }, + { + "epoch": 0.26653, + "grad_norm": 0.8553751372748348, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 26653 + }, + { + "epoch": 0.26654, + "grad_norm": 0.8902401302342804, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 26654 + }, + { + "epoch": 0.26655, + "grad_norm": 0.8658398350047829, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 26655 + }, + { + "epoch": 0.26656, + "grad_norm": 0.7817725862644421, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 26656 + }, + { + "epoch": 0.26657, + "grad_norm": 0.7151587655078991, + "learning_rate": 0.003, + "loss": 4.023, + "step": 26657 + }, + { + "epoch": 0.26658, + "grad_norm": 0.7822227505750222, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 26658 + }, + { + "epoch": 0.26659, + "grad_norm": 0.85944374577339, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 26659 + }, + { + "epoch": 0.2666, + "grad_norm": 1.0289665509090975, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 26660 + }, + { + "epoch": 0.26661, + "grad_norm": 1.2521364519932363, + "learning_rate": 0.003, + "loss": 4.061, + "step": 26661 + }, + { + "epoch": 0.26662, + "grad_norm": 0.819131146464348, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 26662 + }, + { + "epoch": 0.26663, + "grad_norm": 0.7119951588668726, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 26663 + }, + { + "epoch": 0.26664, + "grad_norm": 0.728751474300894, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 26664 + }, + { + "epoch": 0.26665, + "grad_norm": 0.7264442585143859, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 26665 + }, + { + "epoch": 0.26666, + "grad_norm": 0.7460528191940624, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 26666 + }, + { + "epoch": 0.26667, + "grad_norm": 0.6649698464189977, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 26667 + }, + { + "epoch": 0.26668, + "grad_norm": 0.5378801838459671, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 26668 + }, + { + "epoch": 0.26669, + "grad_norm": 0.5632099155812313, + "learning_rate": 0.003, + "loss": 4.034, + "step": 26669 + }, + { + "epoch": 0.2667, + "grad_norm": 0.5350368434627366, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 26670 + }, + { + "epoch": 0.26671, + "grad_norm": 0.5303576151520706, + "learning_rate": 0.003, + "loss": 4.0039, + "step": 26671 + }, + { + "epoch": 0.26672, + "grad_norm": 0.6109151039924823, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 26672 + }, + { + "epoch": 0.26673, + "grad_norm": 0.796179812841856, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 26673 + }, + { + "epoch": 0.26674, + "grad_norm": 1.0923526359029165, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 26674 + }, + { + "epoch": 0.26675, + "grad_norm": 1.0762238704000684, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 26675 + }, + { + "epoch": 0.26676, + "grad_norm": 1.0557974279745912, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 26676 + }, + { + "epoch": 0.26677, + "grad_norm": 0.9822702705691877, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 26677 + }, + { + "epoch": 0.26678, + "grad_norm": 1.0966666886212464, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 26678 + }, + { + "epoch": 0.26679, + "grad_norm": 0.7858873010454793, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 26679 + }, + { + "epoch": 0.2668, + "grad_norm": 0.6076935779388353, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 26680 + }, + { + "epoch": 0.26681, + "grad_norm": 0.6412059021445111, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 26681 + }, + { + "epoch": 0.26682, + "grad_norm": 0.8000140632199488, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 26682 + }, + { + "epoch": 0.26683, + "grad_norm": 0.912916014743871, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 26683 + }, + { + "epoch": 0.26684, + "grad_norm": 0.9923446803688599, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 26684 + }, + { + "epoch": 0.26685, + "grad_norm": 1.207405723846636, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 26685 + }, + { + "epoch": 0.26686, + "grad_norm": 0.8520035522621076, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 26686 + }, + { + "epoch": 0.26687, + "grad_norm": 0.8113852889498812, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 26687 + }, + { + "epoch": 0.26688, + "grad_norm": 0.9038452044667784, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 26688 + }, + { + "epoch": 0.26689, + "grad_norm": 0.9815309739157995, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 26689 + }, + { + "epoch": 0.2669, + "grad_norm": 1.0388893042418477, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 26690 + }, + { + "epoch": 0.26691, + "grad_norm": 0.9291636438353094, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 26691 + }, + { + "epoch": 0.26692, + "grad_norm": 0.9440179886077221, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 26692 + }, + { + "epoch": 0.26693, + "grad_norm": 1.0742388681689106, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 26693 + }, + { + "epoch": 0.26694, + "grad_norm": 0.9951746493562946, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 26694 + }, + { + "epoch": 0.26695, + "grad_norm": 0.8817734937788403, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 26695 + }, + { + "epoch": 0.26696, + "grad_norm": 0.860687491951749, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 26696 + }, + { + "epoch": 0.26697, + "grad_norm": 1.0688683442937588, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 26697 + }, + { + "epoch": 0.26698, + "grad_norm": 0.9522765700180879, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 26698 + }, + { + "epoch": 0.26699, + "grad_norm": 0.9027802009966653, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 26699 + }, + { + "epoch": 0.267, + "grad_norm": 1.0561183837657433, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 26700 + }, + { + "epoch": 0.26701, + "grad_norm": 1.0675308573497253, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 26701 + }, + { + "epoch": 0.26702, + "grad_norm": 1.028566822243012, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 26702 + }, + { + "epoch": 0.26703, + "grad_norm": 0.9313649923505639, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 26703 + }, + { + "epoch": 0.26704, + "grad_norm": 0.785645078280347, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 26704 + }, + { + "epoch": 0.26705, + "grad_norm": 0.6892208234298768, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 26705 + }, + { + "epoch": 0.26706, + "grad_norm": 0.5772106607160312, + "learning_rate": 0.003, + "loss": 4.044, + "step": 26706 + }, + { + "epoch": 0.26707, + "grad_norm": 0.6333217733057827, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 26707 + }, + { + "epoch": 0.26708, + "grad_norm": 0.7968952712638063, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 26708 + }, + { + "epoch": 0.26709, + "grad_norm": 0.992929318826041, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 26709 + }, + { + "epoch": 0.2671, + "grad_norm": 1.0900549571691964, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 26710 + }, + { + "epoch": 0.26711, + "grad_norm": 0.8046750135462912, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 26711 + }, + { + "epoch": 0.26712, + "grad_norm": 0.8794702623575531, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 26712 + }, + { + "epoch": 0.26713, + "grad_norm": 0.895768487197142, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 26713 + }, + { + "epoch": 0.26714, + "grad_norm": 0.7857887607120044, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 26714 + }, + { + "epoch": 0.26715, + "grad_norm": 0.8334878305798049, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 26715 + }, + { + "epoch": 0.26716, + "grad_norm": 0.9199768839994255, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 26716 + }, + { + "epoch": 0.26717, + "grad_norm": 0.9510913854053771, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 26717 + }, + { + "epoch": 0.26718, + "grad_norm": 0.9733343841124151, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 26718 + }, + { + "epoch": 0.26719, + "grad_norm": 0.9300557931902645, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 26719 + }, + { + "epoch": 0.2672, + "grad_norm": 0.8673640555025532, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 26720 + }, + { + "epoch": 0.26721, + "grad_norm": 0.7590328475857064, + "learning_rate": 0.003, + "loss": 4.025, + "step": 26721 + }, + { + "epoch": 0.26722, + "grad_norm": 0.85164985943911, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 26722 + }, + { + "epoch": 0.26723, + "grad_norm": 0.8797060503741002, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 26723 + }, + { + "epoch": 0.26724, + "grad_norm": 1.0742166966975317, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 26724 + }, + { + "epoch": 0.26725, + "grad_norm": 1.0109969540785737, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 26725 + }, + { + "epoch": 0.26726, + "grad_norm": 1.0907737540438218, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 26726 + }, + { + "epoch": 0.26727, + "grad_norm": 0.9942324362436002, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 26727 + }, + { + "epoch": 0.26728, + "grad_norm": 0.8371243951848143, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 26728 + }, + { + "epoch": 0.26729, + "grad_norm": 0.6872804004639314, + "learning_rate": 0.003, + "loss": 4.06, + "step": 26729 + }, + { + "epoch": 0.2673, + "grad_norm": 0.591674397944938, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 26730 + }, + { + "epoch": 0.26731, + "grad_norm": 0.6606672414326421, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 26731 + }, + { + "epoch": 0.26732, + "grad_norm": 0.6572819178724009, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 26732 + }, + { + "epoch": 0.26733, + "grad_norm": 0.6940561532330184, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 26733 + }, + { + "epoch": 0.26734, + "grad_norm": 0.6965391264343574, + "learning_rate": 0.003, + "loss": 4.0037, + "step": 26734 + }, + { + "epoch": 0.26735, + "grad_norm": 0.7501205854225137, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 26735 + }, + { + "epoch": 0.26736, + "grad_norm": 0.849406258319405, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 26736 + }, + { + "epoch": 0.26737, + "grad_norm": 0.8531018082178712, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 26737 + }, + { + "epoch": 0.26738, + "grad_norm": 0.9511979581549274, + "learning_rate": 0.003, + "loss": 4.052, + "step": 26738 + }, + { + "epoch": 0.26739, + "grad_norm": 1.2107113828172498, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 26739 + }, + { + "epoch": 0.2674, + "grad_norm": 0.8612989313071496, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 26740 + }, + { + "epoch": 0.26741, + "grad_norm": 0.7283285608965303, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 26741 + }, + { + "epoch": 0.26742, + "grad_norm": 0.7379148522520755, + "learning_rate": 0.003, + "loss": 4.039, + "step": 26742 + }, + { + "epoch": 0.26743, + "grad_norm": 0.7174040269715107, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 26743 + }, + { + "epoch": 0.26744, + "grad_norm": 0.8308836330838764, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 26744 + }, + { + "epoch": 0.26745, + "grad_norm": 0.8196239611600561, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 26745 + }, + { + "epoch": 0.26746, + "grad_norm": 0.8682085238476234, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 26746 + }, + { + "epoch": 0.26747, + "grad_norm": 0.9603288625812608, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 26747 + }, + { + "epoch": 0.26748, + "grad_norm": 1.0198777595068194, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 26748 + }, + { + "epoch": 0.26749, + "grad_norm": 1.2845640757730779, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 26749 + }, + { + "epoch": 0.2675, + "grad_norm": 1.032885193362763, + "learning_rate": 0.003, + "loss": 4.047, + "step": 26750 + }, + { + "epoch": 0.26751, + "grad_norm": 1.1613661321720825, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 26751 + }, + { + "epoch": 0.26752, + "grad_norm": 0.8530085664308429, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 26752 + }, + { + "epoch": 0.26753, + "grad_norm": 0.7014666181493231, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 26753 + }, + { + "epoch": 0.26754, + "grad_norm": 0.6796645147335215, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 26754 + }, + { + "epoch": 0.26755, + "grad_norm": 0.6562142494094009, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 26755 + }, + { + "epoch": 0.26756, + "grad_norm": 0.6026007136599633, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 26756 + }, + { + "epoch": 0.26757, + "grad_norm": 0.5699672937551186, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 26757 + }, + { + "epoch": 0.26758, + "grad_norm": 0.576757488510103, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 26758 + }, + { + "epoch": 0.26759, + "grad_norm": 0.6704464721641104, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 26759 + }, + { + "epoch": 0.2676, + "grad_norm": 0.9361903069061588, + "learning_rate": 0.003, + "loss": 4.046, + "step": 26760 + }, + { + "epoch": 0.26761, + "grad_norm": 1.2288749481239083, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 26761 + }, + { + "epoch": 0.26762, + "grad_norm": 0.9443992511767147, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 26762 + }, + { + "epoch": 0.26763, + "grad_norm": 0.9785956628929858, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 26763 + }, + { + "epoch": 0.26764, + "grad_norm": 0.9163626702234071, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 26764 + }, + { + "epoch": 0.26765, + "grad_norm": 0.840137909902933, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 26765 + }, + { + "epoch": 0.26766, + "grad_norm": 0.7684002808570489, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 26766 + }, + { + "epoch": 0.26767, + "grad_norm": 0.7570456762377298, + "learning_rate": 0.003, + "loss": 4.0111, + "step": 26767 + }, + { + "epoch": 0.26768, + "grad_norm": 0.7883064882330026, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 26768 + }, + { + "epoch": 0.26769, + "grad_norm": 0.7510368380664076, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 26769 + }, + { + "epoch": 0.2677, + "grad_norm": 0.8714694913319477, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 26770 + }, + { + "epoch": 0.26771, + "grad_norm": 0.9931551188063039, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 26771 + }, + { + "epoch": 0.26772, + "grad_norm": 0.8787342561461627, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 26772 + }, + { + "epoch": 0.26773, + "grad_norm": 0.8621318653400925, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 26773 + }, + { + "epoch": 0.26774, + "grad_norm": 0.8325085381923846, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 26774 + }, + { + "epoch": 0.26775, + "grad_norm": 0.9047825263675746, + "learning_rate": 0.003, + "loss": 4.057, + "step": 26775 + }, + { + "epoch": 0.26776, + "grad_norm": 1.1501523171767354, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 26776 + }, + { + "epoch": 0.26777, + "grad_norm": 1.269977820533238, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 26777 + }, + { + "epoch": 0.26778, + "grad_norm": 0.8090218286372404, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 26778 + }, + { + "epoch": 0.26779, + "grad_norm": 0.5805851621821513, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 26779 + }, + { + "epoch": 0.2678, + "grad_norm": 0.6929948907046206, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 26780 + }, + { + "epoch": 0.26781, + "grad_norm": 0.8412630997929835, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 26781 + }, + { + "epoch": 0.26782, + "grad_norm": 1.0243521372555395, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 26782 + }, + { + "epoch": 0.26783, + "grad_norm": 1.0856315744361238, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 26783 + }, + { + "epoch": 0.26784, + "grad_norm": 0.8348121730140173, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 26784 + }, + { + "epoch": 0.26785, + "grad_norm": 0.7378561634094751, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 26785 + }, + { + "epoch": 0.26786, + "grad_norm": 0.8226386741847183, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 26786 + }, + { + "epoch": 0.26787, + "grad_norm": 0.8791824139867304, + "learning_rate": 0.003, + "loss": 4.047, + "step": 26787 + }, + { + "epoch": 0.26788, + "grad_norm": 0.9121155198908483, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 26788 + }, + { + "epoch": 0.26789, + "grad_norm": 1.0041298023638419, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 26789 + }, + { + "epoch": 0.2679, + "grad_norm": 1.154655682264059, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 26790 + }, + { + "epoch": 0.26791, + "grad_norm": 0.7453429369776192, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 26791 + }, + { + "epoch": 0.26792, + "grad_norm": 0.7944509654961418, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 26792 + }, + { + "epoch": 0.26793, + "grad_norm": 0.8993588114106738, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 26793 + }, + { + "epoch": 0.26794, + "grad_norm": 0.9500213961892805, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 26794 + }, + { + "epoch": 0.26795, + "grad_norm": 1.0357962594391055, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 26795 + }, + { + "epoch": 0.26796, + "grad_norm": 0.8152045205170331, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 26796 + }, + { + "epoch": 0.26797, + "grad_norm": 0.7699667054161323, + "learning_rate": 0.003, + "loss": 4.056, + "step": 26797 + }, + { + "epoch": 0.26798, + "grad_norm": 0.7538410406382684, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 26798 + }, + { + "epoch": 0.26799, + "grad_norm": 0.8229955553368276, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 26799 + }, + { + "epoch": 0.268, + "grad_norm": 0.9507927465758806, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 26800 + }, + { + "epoch": 0.26801, + "grad_norm": 1.061773974768729, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 26801 + }, + { + "epoch": 0.26802, + "grad_norm": 0.9851158659326185, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 26802 + }, + { + "epoch": 0.26803, + "grad_norm": 0.9839605172370657, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 26803 + }, + { + "epoch": 0.26804, + "grad_norm": 1.1199766359260959, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 26804 + }, + { + "epoch": 0.26805, + "grad_norm": 0.9321880203486483, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 26805 + }, + { + "epoch": 0.26806, + "grad_norm": 0.7683828914268355, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 26806 + }, + { + "epoch": 0.26807, + "grad_norm": 0.7264836985086381, + "learning_rate": 0.003, + "loss": 4.033, + "step": 26807 + }, + { + "epoch": 0.26808, + "grad_norm": 0.6947152787508986, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 26808 + }, + { + "epoch": 0.26809, + "grad_norm": 0.7887681249681788, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 26809 + }, + { + "epoch": 0.2681, + "grad_norm": 0.7766616004275769, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 26810 + }, + { + "epoch": 0.26811, + "grad_norm": 0.8162115777546617, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 26811 + }, + { + "epoch": 0.26812, + "grad_norm": 1.00836242098642, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 26812 + }, + { + "epoch": 0.26813, + "grad_norm": 1.1787009291832173, + "learning_rate": 0.003, + "loss": 4.064, + "step": 26813 + }, + { + "epoch": 0.26814, + "grad_norm": 0.8555377053320512, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 26814 + }, + { + "epoch": 0.26815, + "grad_norm": 0.7196022469281201, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 26815 + }, + { + "epoch": 0.26816, + "grad_norm": 0.7315588476246786, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 26816 + }, + { + "epoch": 0.26817, + "grad_norm": 0.8199991250667189, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 26817 + }, + { + "epoch": 0.26818, + "grad_norm": 0.9238319690410037, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 26818 + }, + { + "epoch": 0.26819, + "grad_norm": 0.7632163132511808, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 26819 + }, + { + "epoch": 0.2682, + "grad_norm": 0.6998543010023512, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 26820 + }, + { + "epoch": 0.26821, + "grad_norm": 0.7116082409956874, + "learning_rate": 0.003, + "loss": 4.061, + "step": 26821 + }, + { + "epoch": 0.26822, + "grad_norm": 0.7422357109023314, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 26822 + }, + { + "epoch": 0.26823, + "grad_norm": 0.8270071713610251, + "learning_rate": 0.003, + "loss": 4.029, + "step": 26823 + }, + { + "epoch": 0.26824, + "grad_norm": 0.9616981269291839, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 26824 + }, + { + "epoch": 0.26825, + "grad_norm": 1.1042249593304798, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 26825 + }, + { + "epoch": 0.26826, + "grad_norm": 0.9855217666287078, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 26826 + }, + { + "epoch": 0.26827, + "grad_norm": 0.9745472722582985, + "learning_rate": 0.003, + "loss": 4.027, + "step": 26827 + }, + { + "epoch": 0.26828, + "grad_norm": 0.8503705073044941, + "learning_rate": 0.003, + "loss": 4.07, + "step": 26828 + }, + { + "epoch": 0.26829, + "grad_norm": 0.7778703583133462, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 26829 + }, + { + "epoch": 0.2683, + "grad_norm": 0.7495433301340197, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 26830 + }, + { + "epoch": 0.26831, + "grad_norm": 0.7790554520503469, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 26831 + }, + { + "epoch": 0.26832, + "grad_norm": 0.8573796288062671, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 26832 + }, + { + "epoch": 0.26833, + "grad_norm": 0.9980454705417464, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 26833 + }, + { + "epoch": 0.26834, + "grad_norm": 1.2160208084228539, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 26834 + }, + { + "epoch": 0.26835, + "grad_norm": 0.8427110402087028, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 26835 + }, + { + "epoch": 0.26836, + "grad_norm": 0.771167677946136, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 26836 + }, + { + "epoch": 0.26837, + "grad_norm": 0.7292664108361273, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 26837 + }, + { + "epoch": 0.26838, + "grad_norm": 0.8143570985261177, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 26838 + }, + { + "epoch": 0.26839, + "grad_norm": 0.7276168129385351, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 26839 + }, + { + "epoch": 0.2684, + "grad_norm": 0.6712678770209531, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 26840 + }, + { + "epoch": 0.26841, + "grad_norm": 0.7821541483127419, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 26841 + }, + { + "epoch": 0.26842, + "grad_norm": 0.8808361755103088, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 26842 + }, + { + "epoch": 0.26843, + "grad_norm": 1.0006668139850003, + "learning_rate": 0.003, + "loss": 4.055, + "step": 26843 + }, + { + "epoch": 0.26844, + "grad_norm": 1.1981811240725382, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 26844 + }, + { + "epoch": 0.26845, + "grad_norm": 0.9226266287032532, + "learning_rate": 0.003, + "loss": 4.045, + "step": 26845 + }, + { + "epoch": 0.26846, + "grad_norm": 0.9479866770346266, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 26846 + }, + { + "epoch": 0.26847, + "grad_norm": 0.9964166812344053, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 26847 + }, + { + "epoch": 0.26848, + "grad_norm": 0.9939089689554386, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 26848 + }, + { + "epoch": 0.26849, + "grad_norm": 0.9709551291704664, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 26849 + }, + { + "epoch": 0.2685, + "grad_norm": 1.020165551165223, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 26850 + }, + { + "epoch": 0.26851, + "grad_norm": 0.8863277947095122, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 26851 + }, + { + "epoch": 0.26852, + "grad_norm": 1.0491345460192698, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 26852 + }, + { + "epoch": 0.26853, + "grad_norm": 1.122350525012377, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 26853 + }, + { + "epoch": 0.26854, + "grad_norm": 0.8395329447220833, + "learning_rate": 0.003, + "loss": 4.034, + "step": 26854 + }, + { + "epoch": 0.26855, + "grad_norm": 0.8670364035506442, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 26855 + }, + { + "epoch": 0.26856, + "grad_norm": 0.9755359708285688, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 26856 + }, + { + "epoch": 0.26857, + "grad_norm": 1.0751348531951983, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 26857 + }, + { + "epoch": 0.26858, + "grad_norm": 0.911777289119023, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 26858 + }, + { + "epoch": 0.26859, + "grad_norm": 0.897093061950949, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 26859 + }, + { + "epoch": 0.2686, + "grad_norm": 1.0473880101989306, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 26860 + }, + { + "epoch": 0.26861, + "grad_norm": 0.7952945597290391, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 26861 + }, + { + "epoch": 0.26862, + "grad_norm": 0.6177411675039973, + "learning_rate": 0.003, + "loss": 4.044, + "step": 26862 + }, + { + "epoch": 0.26863, + "grad_norm": 0.6623986153088346, + "learning_rate": 0.003, + "loss": 4.047, + "step": 26863 + }, + { + "epoch": 0.26864, + "grad_norm": 0.7220856558057682, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 26864 + }, + { + "epoch": 0.26865, + "grad_norm": 0.9358425399702608, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 26865 + }, + { + "epoch": 0.26866, + "grad_norm": 1.1931919177174442, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 26866 + }, + { + "epoch": 0.26867, + "grad_norm": 0.6797091494412336, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 26867 + }, + { + "epoch": 0.26868, + "grad_norm": 0.5875798096716746, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 26868 + }, + { + "epoch": 0.26869, + "grad_norm": 0.5814283715012318, + "learning_rate": 0.003, + "loss": 4.037, + "step": 26869 + }, + { + "epoch": 0.2687, + "grad_norm": 0.5873322351884686, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 26870 + }, + { + "epoch": 0.26871, + "grad_norm": 0.6833628487384019, + "learning_rate": 0.003, + "loss": 4.043, + "step": 26871 + }, + { + "epoch": 0.26872, + "grad_norm": 0.7927082539457238, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 26872 + }, + { + "epoch": 0.26873, + "grad_norm": 0.8809740323636197, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 26873 + }, + { + "epoch": 0.26874, + "grad_norm": 0.948659879011425, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 26874 + }, + { + "epoch": 0.26875, + "grad_norm": 0.9311997267153175, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 26875 + }, + { + "epoch": 0.26876, + "grad_norm": 0.7967406915826373, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 26876 + }, + { + "epoch": 0.26877, + "grad_norm": 0.8290844738251041, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 26877 + }, + { + "epoch": 0.26878, + "grad_norm": 0.8733458167763013, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 26878 + }, + { + "epoch": 0.26879, + "grad_norm": 0.9768662749253959, + "learning_rate": 0.003, + "loss": 3.9969, + "step": 26879 + }, + { + "epoch": 0.2688, + "grad_norm": 1.048594960164772, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 26880 + }, + { + "epoch": 0.26881, + "grad_norm": 1.0123188758295774, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 26881 + }, + { + "epoch": 0.26882, + "grad_norm": 1.0862197214663303, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 26882 + }, + { + "epoch": 0.26883, + "grad_norm": 0.8773855842942945, + "learning_rate": 0.003, + "loss": 4.052, + "step": 26883 + }, + { + "epoch": 0.26884, + "grad_norm": 0.8248028156878672, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 26884 + }, + { + "epoch": 0.26885, + "grad_norm": 0.7927315459583432, + "learning_rate": 0.003, + "loss": 4.064, + "step": 26885 + }, + { + "epoch": 0.26886, + "grad_norm": 0.6773839829512543, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 26886 + }, + { + "epoch": 0.26887, + "grad_norm": 0.696430811548455, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 26887 + }, + { + "epoch": 0.26888, + "grad_norm": 0.8127898845894397, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 26888 + }, + { + "epoch": 0.26889, + "grad_norm": 0.9689792715233774, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 26889 + }, + { + "epoch": 0.2689, + "grad_norm": 1.1866731850723529, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 26890 + }, + { + "epoch": 0.26891, + "grad_norm": 0.7376585720892403, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 26891 + }, + { + "epoch": 0.26892, + "grad_norm": 0.6739327685475454, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 26892 + }, + { + "epoch": 0.26893, + "grad_norm": 0.8125013079317227, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 26893 + }, + { + "epoch": 0.26894, + "grad_norm": 0.8962227967136355, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 26894 + }, + { + "epoch": 0.26895, + "grad_norm": 0.9690916866230579, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 26895 + }, + { + "epoch": 0.26896, + "grad_norm": 0.9886794941489507, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 26896 + }, + { + "epoch": 0.26897, + "grad_norm": 0.9393189006689492, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 26897 + }, + { + "epoch": 0.26898, + "grad_norm": 0.9322092139945386, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 26898 + }, + { + "epoch": 0.26899, + "grad_norm": 0.9599102393951192, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 26899 + }, + { + "epoch": 0.269, + "grad_norm": 1.0529681803354871, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 26900 + }, + { + "epoch": 0.26901, + "grad_norm": 0.8937160858191697, + "learning_rate": 0.003, + "loss": 4.083, + "step": 26901 + }, + { + "epoch": 0.26902, + "grad_norm": 0.8697776795024742, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 26902 + }, + { + "epoch": 0.26903, + "grad_norm": 0.9441589877474087, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 26903 + }, + { + "epoch": 0.26904, + "grad_norm": 0.9408467398049117, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 26904 + }, + { + "epoch": 0.26905, + "grad_norm": 1.0427706769069847, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 26905 + }, + { + "epoch": 0.26906, + "grad_norm": 1.1779695878184737, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 26906 + }, + { + "epoch": 0.26907, + "grad_norm": 0.9261675096869896, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 26907 + }, + { + "epoch": 0.26908, + "grad_norm": 0.8907088884950782, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 26908 + }, + { + "epoch": 0.26909, + "grad_norm": 0.9754967728243714, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 26909 + }, + { + "epoch": 0.2691, + "grad_norm": 0.9227745262990029, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 26910 + }, + { + "epoch": 0.26911, + "grad_norm": 0.9354624523764706, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 26911 + }, + { + "epoch": 0.26912, + "grad_norm": 0.8434797054530185, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 26912 + }, + { + "epoch": 0.26913, + "grad_norm": 0.7113691771089369, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 26913 + }, + { + "epoch": 0.26914, + "grad_norm": 0.6890992042045092, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 26914 + }, + { + "epoch": 0.26915, + "grad_norm": 0.7743484120527323, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 26915 + }, + { + "epoch": 0.26916, + "grad_norm": 0.8732887868793218, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 26916 + }, + { + "epoch": 0.26917, + "grad_norm": 0.9988592849699839, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 26917 + }, + { + "epoch": 0.26918, + "grad_norm": 1.1767479433472368, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 26918 + }, + { + "epoch": 0.26919, + "grad_norm": 0.8185723291049996, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 26919 + }, + { + "epoch": 0.2692, + "grad_norm": 0.7013998169985814, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 26920 + }, + { + "epoch": 0.26921, + "grad_norm": 0.6495103391659941, + "learning_rate": 0.003, + "loss": 4.055, + "step": 26921 + }, + { + "epoch": 0.26922, + "grad_norm": 0.7492917000294076, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 26922 + }, + { + "epoch": 0.26923, + "grad_norm": 0.8915331539146565, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 26923 + }, + { + "epoch": 0.26924, + "grad_norm": 0.8773980385665381, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 26924 + }, + { + "epoch": 0.26925, + "grad_norm": 0.816513964939444, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 26925 + }, + { + "epoch": 0.26926, + "grad_norm": 0.9276444245280357, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 26926 + }, + { + "epoch": 0.26927, + "grad_norm": 0.9470220678491759, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 26927 + }, + { + "epoch": 0.26928, + "grad_norm": 0.8719539988217069, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 26928 + }, + { + "epoch": 0.26929, + "grad_norm": 0.9114960260049979, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 26929 + }, + { + "epoch": 0.2693, + "grad_norm": 0.8488171457589256, + "learning_rate": 0.003, + "loss": 4.051, + "step": 26930 + }, + { + "epoch": 0.26931, + "grad_norm": 0.7444986478644735, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 26931 + }, + { + "epoch": 0.26932, + "grad_norm": 0.7413878889613189, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 26932 + }, + { + "epoch": 0.26933, + "grad_norm": 0.6722002862335263, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 26933 + }, + { + "epoch": 0.26934, + "grad_norm": 0.6103254018920676, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 26934 + }, + { + "epoch": 0.26935, + "grad_norm": 0.6297175998152325, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 26935 + }, + { + "epoch": 0.26936, + "grad_norm": 0.7337259835500222, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 26936 + }, + { + "epoch": 0.26937, + "grad_norm": 0.9530827528919407, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 26937 + }, + { + "epoch": 0.26938, + "grad_norm": 1.2854979574138448, + "learning_rate": 0.003, + "loss": 4.037, + "step": 26938 + }, + { + "epoch": 0.26939, + "grad_norm": 0.6861287002916118, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 26939 + }, + { + "epoch": 0.2694, + "grad_norm": 0.7443794193796209, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 26940 + }, + { + "epoch": 0.26941, + "grad_norm": 0.8196112791522115, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 26941 + }, + { + "epoch": 0.26942, + "grad_norm": 0.8466972595900097, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 26942 + }, + { + "epoch": 0.26943, + "grad_norm": 0.8099766426461441, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 26943 + }, + { + "epoch": 0.26944, + "grad_norm": 0.8899064059408505, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 26944 + }, + { + "epoch": 0.26945, + "grad_norm": 0.9318379120267478, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 26945 + }, + { + "epoch": 0.26946, + "grad_norm": 0.8771469781142658, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 26946 + }, + { + "epoch": 0.26947, + "grad_norm": 0.8386181897446428, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 26947 + }, + { + "epoch": 0.26948, + "grad_norm": 0.9618846668564298, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 26948 + }, + { + "epoch": 0.26949, + "grad_norm": 1.2857319011480317, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 26949 + }, + { + "epoch": 0.2695, + "grad_norm": 0.8060868370808525, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 26950 + }, + { + "epoch": 0.26951, + "grad_norm": 0.6744944940911806, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 26951 + }, + { + "epoch": 0.26952, + "grad_norm": 0.6311334445449999, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 26952 + }, + { + "epoch": 0.26953, + "grad_norm": 0.6138559541103576, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 26953 + }, + { + "epoch": 0.26954, + "grad_norm": 0.7207008896100006, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 26954 + }, + { + "epoch": 0.26955, + "grad_norm": 0.7960129467106797, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 26955 + }, + { + "epoch": 0.26956, + "grad_norm": 1.0306091938573996, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 26956 + }, + { + "epoch": 0.26957, + "grad_norm": 1.2399489206016752, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 26957 + }, + { + "epoch": 0.26958, + "grad_norm": 0.6633366064492253, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 26958 + }, + { + "epoch": 0.26959, + "grad_norm": 0.8914599232934599, + "learning_rate": 0.003, + "loss": 4.042, + "step": 26959 + }, + { + "epoch": 0.2696, + "grad_norm": 1.1988140196872528, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 26960 + }, + { + "epoch": 0.26961, + "grad_norm": 0.7619094777538519, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 26961 + }, + { + "epoch": 0.26962, + "grad_norm": 0.726379209536206, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 26962 + }, + { + "epoch": 0.26963, + "grad_norm": 0.7616517343158844, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 26963 + }, + { + "epoch": 0.26964, + "grad_norm": 0.8716842871721308, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 26964 + }, + { + "epoch": 0.26965, + "grad_norm": 0.8680209714315246, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 26965 + }, + { + "epoch": 0.26966, + "grad_norm": 0.8236976839929548, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 26966 + }, + { + "epoch": 0.26967, + "grad_norm": 0.8870763590881062, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 26967 + }, + { + "epoch": 0.26968, + "grad_norm": 0.8928321364216114, + "learning_rate": 0.003, + "loss": 4.067, + "step": 26968 + }, + { + "epoch": 0.26969, + "grad_norm": 1.107626601959848, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 26969 + }, + { + "epoch": 0.2697, + "grad_norm": 1.1838746563329594, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 26970 + }, + { + "epoch": 0.26971, + "grad_norm": 0.9514705323076532, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 26971 + }, + { + "epoch": 0.26972, + "grad_norm": 1.0454568052408633, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 26972 + }, + { + "epoch": 0.26973, + "grad_norm": 0.972162909278101, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 26973 + }, + { + "epoch": 0.26974, + "grad_norm": 0.8729060866361867, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 26974 + }, + { + "epoch": 0.26975, + "grad_norm": 0.7281073887983386, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 26975 + }, + { + "epoch": 0.26976, + "grad_norm": 0.722974268624212, + "learning_rate": 0.003, + "loss": 4.0042, + "step": 26976 + }, + { + "epoch": 0.26977, + "grad_norm": 0.7088278974867546, + "learning_rate": 0.003, + "loss": 4.055, + "step": 26977 + }, + { + "epoch": 0.26978, + "grad_norm": 0.8154381957769654, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 26978 + }, + { + "epoch": 0.26979, + "grad_norm": 0.8019128991498526, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 26979 + }, + { + "epoch": 0.2698, + "grad_norm": 0.9055559157093138, + "learning_rate": 0.003, + "loss": 4.065, + "step": 26980 + }, + { + "epoch": 0.26981, + "grad_norm": 1.03434086868687, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 26981 + }, + { + "epoch": 0.26982, + "grad_norm": 1.078501936384365, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 26982 + }, + { + "epoch": 0.26983, + "grad_norm": 0.979355409542402, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 26983 + }, + { + "epoch": 0.26984, + "grad_norm": 1.1493175807929523, + "learning_rate": 0.003, + "loss": 4.064, + "step": 26984 + }, + { + "epoch": 0.26985, + "grad_norm": 1.009332427987294, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 26985 + }, + { + "epoch": 0.26986, + "grad_norm": 0.9922820487069157, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 26986 + }, + { + "epoch": 0.26987, + "grad_norm": 0.9817967901810096, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 26987 + }, + { + "epoch": 0.26988, + "grad_norm": 0.9495963392110601, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 26988 + }, + { + "epoch": 0.26989, + "grad_norm": 1.006950027944499, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 26989 + }, + { + "epoch": 0.2699, + "grad_norm": 0.9034623035139304, + "learning_rate": 0.003, + "loss": 4.034, + "step": 26990 + }, + { + "epoch": 0.26991, + "grad_norm": 0.903049053305745, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 26991 + }, + { + "epoch": 0.26992, + "grad_norm": 0.9834822346135722, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 26992 + }, + { + "epoch": 0.26993, + "grad_norm": 0.9646853458619823, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 26993 + }, + { + "epoch": 0.26994, + "grad_norm": 0.9130505587759331, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 26994 + }, + { + "epoch": 0.26995, + "grad_norm": 0.8599653568553773, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 26995 + }, + { + "epoch": 0.26996, + "grad_norm": 0.7586051747483734, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 26996 + }, + { + "epoch": 0.26997, + "grad_norm": 0.7881460368330638, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 26997 + }, + { + "epoch": 0.26998, + "grad_norm": 0.8835350598449638, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 26998 + }, + { + "epoch": 0.26999, + "grad_norm": 1.0002215328670894, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 26999 + }, + { + "epoch": 0.27, + "grad_norm": 1.2678592106742408, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 27000 + }, + { + "epoch": 0.27001, + "grad_norm": 0.7888807958359298, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 27001 + }, + { + "epoch": 0.27002, + "grad_norm": 0.8001773270002617, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 27002 + }, + { + "epoch": 0.27003, + "grad_norm": 0.7842424291198244, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 27003 + }, + { + "epoch": 0.27004, + "grad_norm": 0.8691221990773986, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 27004 + }, + { + "epoch": 0.27005, + "grad_norm": 0.8635074004196631, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 27005 + }, + { + "epoch": 0.27006, + "grad_norm": 0.9691884263732948, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 27006 + }, + { + "epoch": 0.27007, + "grad_norm": 1.1158654969790378, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 27007 + }, + { + "epoch": 0.27008, + "grad_norm": 0.8842773181896927, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 27008 + }, + { + "epoch": 0.27009, + "grad_norm": 0.8465386885128169, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 27009 + }, + { + "epoch": 0.2701, + "grad_norm": 0.898879990236479, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 27010 + }, + { + "epoch": 0.27011, + "grad_norm": 0.8585009785537153, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 27011 + }, + { + "epoch": 0.27012, + "grad_norm": 0.8656905129709597, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 27012 + }, + { + "epoch": 0.27013, + "grad_norm": 0.8727755915044394, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 27013 + }, + { + "epoch": 0.27014, + "grad_norm": 1.0102060053541189, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 27014 + }, + { + "epoch": 0.27015, + "grad_norm": 0.9879825045425192, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 27015 + }, + { + "epoch": 0.27016, + "grad_norm": 0.9505205842954543, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 27016 + }, + { + "epoch": 0.27017, + "grad_norm": 0.901354675971913, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 27017 + }, + { + "epoch": 0.27018, + "grad_norm": 0.8583664233017747, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 27018 + }, + { + "epoch": 0.27019, + "grad_norm": 0.6834413529053646, + "learning_rate": 0.003, + "loss": 4.046, + "step": 27019 + }, + { + "epoch": 0.2702, + "grad_norm": 0.6460403116168174, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 27020 + }, + { + "epoch": 0.27021, + "grad_norm": 0.7822032326101864, + "learning_rate": 0.003, + "loss": 4.027, + "step": 27021 + }, + { + "epoch": 0.27022, + "grad_norm": 0.8600317593547536, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 27022 + }, + { + "epoch": 0.27023, + "grad_norm": 0.9050852518059926, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 27023 + }, + { + "epoch": 0.27024, + "grad_norm": 0.8660682573181737, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 27024 + }, + { + "epoch": 0.27025, + "grad_norm": 0.7668665843089587, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 27025 + }, + { + "epoch": 0.27026, + "grad_norm": 0.7728861933718528, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 27026 + }, + { + "epoch": 0.27027, + "grad_norm": 0.8220203474732518, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 27027 + }, + { + "epoch": 0.27028, + "grad_norm": 0.9214167674101033, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 27028 + }, + { + "epoch": 0.27029, + "grad_norm": 0.9083241540942567, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 27029 + }, + { + "epoch": 0.2703, + "grad_norm": 0.8370697180268717, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 27030 + }, + { + "epoch": 0.27031, + "grad_norm": 0.7671989562499478, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 27031 + }, + { + "epoch": 0.27032, + "grad_norm": 0.6628690520304105, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 27032 + }, + { + "epoch": 0.27033, + "grad_norm": 0.5982827661267192, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 27033 + }, + { + "epoch": 0.27034, + "grad_norm": 0.6243591715428094, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 27034 + }, + { + "epoch": 0.27035, + "grad_norm": 0.652055125391747, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 27035 + }, + { + "epoch": 0.27036, + "grad_norm": 0.7788987968893006, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 27036 + }, + { + "epoch": 0.27037, + "grad_norm": 0.8685538520831548, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 27037 + }, + { + "epoch": 0.27038, + "grad_norm": 1.2172404263937173, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 27038 + }, + { + "epoch": 0.27039, + "grad_norm": 0.995505119093717, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 27039 + }, + { + "epoch": 0.2704, + "grad_norm": 0.9476755162293994, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 27040 + }, + { + "epoch": 0.27041, + "grad_norm": 0.879207148536449, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 27041 + }, + { + "epoch": 0.27042, + "grad_norm": 0.7897383323367847, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 27042 + }, + { + "epoch": 0.27043, + "grad_norm": 0.7309952298592794, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 27043 + }, + { + "epoch": 0.27044, + "grad_norm": 0.8461307769230635, + "learning_rate": 0.003, + "loss": 4.049, + "step": 27044 + }, + { + "epoch": 0.27045, + "grad_norm": 0.9665470438237945, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 27045 + }, + { + "epoch": 0.27046, + "grad_norm": 0.9448790184090723, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 27046 + }, + { + "epoch": 0.27047, + "grad_norm": 1.0562687809824105, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 27047 + }, + { + "epoch": 0.27048, + "grad_norm": 0.9311199705402079, + "learning_rate": 0.003, + "loss": 3.9959, + "step": 27048 + }, + { + "epoch": 0.27049, + "grad_norm": 0.7690817110686762, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 27049 + }, + { + "epoch": 0.2705, + "grad_norm": 0.6781421195807013, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 27050 + }, + { + "epoch": 0.27051, + "grad_norm": 0.6388413510725814, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 27051 + }, + { + "epoch": 0.27052, + "grad_norm": 0.6114523095010685, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 27052 + }, + { + "epoch": 0.27053, + "grad_norm": 0.6098368230554445, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 27053 + }, + { + "epoch": 0.27054, + "grad_norm": 0.6168358878423139, + "learning_rate": 0.003, + "loss": 4.029, + "step": 27054 + }, + { + "epoch": 0.27055, + "grad_norm": 0.6570552168736712, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 27055 + }, + { + "epoch": 0.27056, + "grad_norm": 0.6483617128459038, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 27056 + }, + { + "epoch": 0.27057, + "grad_norm": 0.6756006612211097, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 27057 + }, + { + "epoch": 0.27058, + "grad_norm": 0.7435872660058324, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 27058 + }, + { + "epoch": 0.27059, + "grad_norm": 0.7840719312028, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 27059 + }, + { + "epoch": 0.2706, + "grad_norm": 0.9207617430543449, + "learning_rate": 0.003, + "loss": 4.047, + "step": 27060 + }, + { + "epoch": 0.27061, + "grad_norm": 1.136363784082638, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 27061 + }, + { + "epoch": 0.27062, + "grad_norm": 0.8025122709638268, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 27062 + }, + { + "epoch": 0.27063, + "grad_norm": 0.9095490548245695, + "learning_rate": 0.003, + "loss": 4.009, + "step": 27063 + }, + { + "epoch": 0.27064, + "grad_norm": 1.2042498680299538, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 27064 + }, + { + "epoch": 0.27065, + "grad_norm": 1.0800332635163825, + "learning_rate": 0.003, + "loss": 4.008, + "step": 27065 + }, + { + "epoch": 0.27066, + "grad_norm": 0.8524130787896782, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 27066 + }, + { + "epoch": 0.27067, + "grad_norm": 0.865176317665912, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 27067 + }, + { + "epoch": 0.27068, + "grad_norm": 0.978232966115204, + "learning_rate": 0.003, + "loss": 4.062, + "step": 27068 + }, + { + "epoch": 0.27069, + "grad_norm": 1.0898764627062663, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 27069 + }, + { + "epoch": 0.2707, + "grad_norm": 0.8569622692460206, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 27070 + }, + { + "epoch": 0.27071, + "grad_norm": 0.7894768430029436, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 27071 + }, + { + "epoch": 0.27072, + "grad_norm": 0.9362915704003091, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 27072 + }, + { + "epoch": 0.27073, + "grad_norm": 1.1120966388838927, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 27073 + }, + { + "epoch": 0.27074, + "grad_norm": 0.9224413003465409, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 27074 + }, + { + "epoch": 0.27075, + "grad_norm": 0.939181567656111, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 27075 + }, + { + "epoch": 0.27076, + "grad_norm": 0.851244091339666, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 27076 + }, + { + "epoch": 0.27077, + "grad_norm": 0.7567516484900256, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 27077 + }, + { + "epoch": 0.27078, + "grad_norm": 0.7463332781513192, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 27078 + }, + { + "epoch": 0.27079, + "grad_norm": 0.8148149789702981, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 27079 + }, + { + "epoch": 0.2708, + "grad_norm": 0.9938724862211209, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 27080 + }, + { + "epoch": 0.27081, + "grad_norm": 1.180361774920303, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 27081 + }, + { + "epoch": 0.27082, + "grad_norm": 1.1600390834129197, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 27082 + }, + { + "epoch": 0.27083, + "grad_norm": 0.9054031154576953, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 27083 + }, + { + "epoch": 0.27084, + "grad_norm": 0.7953268114392842, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 27084 + }, + { + "epoch": 0.27085, + "grad_norm": 0.7356133707957281, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 27085 + }, + { + "epoch": 0.27086, + "grad_norm": 0.8278736200139694, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 27086 + }, + { + "epoch": 0.27087, + "grad_norm": 1.0195372495536443, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 27087 + }, + { + "epoch": 0.27088, + "grad_norm": 1.0690949917266133, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 27088 + }, + { + "epoch": 0.27089, + "grad_norm": 0.8017696662672297, + "learning_rate": 0.003, + "loss": 4.072, + "step": 27089 + }, + { + "epoch": 0.2709, + "grad_norm": 0.7750292715380511, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 27090 + }, + { + "epoch": 0.27091, + "grad_norm": 0.8523388747862698, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 27091 + }, + { + "epoch": 0.27092, + "grad_norm": 0.9283613533948846, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 27092 + }, + { + "epoch": 0.27093, + "grad_norm": 1.1584453953320673, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 27093 + }, + { + "epoch": 0.27094, + "grad_norm": 0.9935158028023288, + "learning_rate": 0.003, + "loss": 4.035, + "step": 27094 + }, + { + "epoch": 0.27095, + "grad_norm": 0.9300280200864448, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 27095 + }, + { + "epoch": 0.27096, + "grad_norm": 0.7730289487522545, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 27096 + }, + { + "epoch": 0.27097, + "grad_norm": 0.7001803330413654, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 27097 + }, + { + "epoch": 0.27098, + "grad_norm": 0.731044577203183, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 27098 + }, + { + "epoch": 0.27099, + "grad_norm": 0.7050428195909509, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 27099 + }, + { + "epoch": 0.271, + "grad_norm": 0.6559706172796037, + "learning_rate": 0.003, + "loss": 4.024, + "step": 27100 + }, + { + "epoch": 0.27101, + "grad_norm": 0.7276162424392029, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 27101 + }, + { + "epoch": 0.27102, + "grad_norm": 0.8857351904951477, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 27102 + }, + { + "epoch": 0.27103, + "grad_norm": 1.070338748386965, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 27103 + }, + { + "epoch": 0.27104, + "grad_norm": 1.0632822661633448, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 27104 + }, + { + "epoch": 0.27105, + "grad_norm": 1.0528687843837061, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 27105 + }, + { + "epoch": 0.27106, + "grad_norm": 0.8160591683191389, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 27106 + }, + { + "epoch": 0.27107, + "grad_norm": 0.702026301860167, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 27107 + }, + { + "epoch": 0.27108, + "grad_norm": 0.7221685866334959, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 27108 + }, + { + "epoch": 0.27109, + "grad_norm": 0.7873405347080112, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 27109 + }, + { + "epoch": 0.2711, + "grad_norm": 0.8800639736338831, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 27110 + }, + { + "epoch": 0.27111, + "grad_norm": 0.9885116721078356, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 27111 + }, + { + "epoch": 0.27112, + "grad_norm": 0.984865766560286, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 27112 + }, + { + "epoch": 0.27113, + "grad_norm": 0.9557134089537638, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 27113 + }, + { + "epoch": 0.27114, + "grad_norm": 0.8856643151610055, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 27114 + }, + { + "epoch": 0.27115, + "grad_norm": 0.8519398837568644, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 27115 + }, + { + "epoch": 0.27116, + "grad_norm": 0.9778039974639392, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 27116 + }, + { + "epoch": 0.27117, + "grad_norm": 1.1613461738282431, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 27117 + }, + { + "epoch": 0.27118, + "grad_norm": 0.7943938679536395, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 27118 + }, + { + "epoch": 0.27119, + "grad_norm": 0.6455142657146546, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 27119 + }, + { + "epoch": 0.2712, + "grad_norm": 0.6892573230524919, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 27120 + }, + { + "epoch": 0.27121, + "grad_norm": 0.6890125558025136, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 27121 + }, + { + "epoch": 0.27122, + "grad_norm": 0.7205962020558068, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 27122 + }, + { + "epoch": 0.27123, + "grad_norm": 0.7472643585410962, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 27123 + }, + { + "epoch": 0.27124, + "grad_norm": 0.8336672277688877, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 27124 + }, + { + "epoch": 0.27125, + "grad_norm": 0.8822704681408061, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 27125 + }, + { + "epoch": 0.27126, + "grad_norm": 0.8932075841157868, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 27126 + }, + { + "epoch": 0.27127, + "grad_norm": 1.0276218172856988, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 27127 + }, + { + "epoch": 0.27128, + "grad_norm": 1.0310460081609527, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 27128 + }, + { + "epoch": 0.27129, + "grad_norm": 1.1186343399518717, + "learning_rate": 0.003, + "loss": 4.056, + "step": 27129 + }, + { + "epoch": 0.2713, + "grad_norm": 1.0456089184684987, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 27130 + }, + { + "epoch": 0.27131, + "grad_norm": 1.0875096627611427, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 27131 + }, + { + "epoch": 0.27132, + "grad_norm": 0.9541387858588575, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 27132 + }, + { + "epoch": 0.27133, + "grad_norm": 0.9677484385097217, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 27133 + }, + { + "epoch": 0.27134, + "grad_norm": 1.063895830873757, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 27134 + }, + { + "epoch": 0.27135, + "grad_norm": 0.9858596988610876, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 27135 + }, + { + "epoch": 0.27136, + "grad_norm": 0.9713079182063803, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 27136 + }, + { + "epoch": 0.27137, + "grad_norm": 0.9061910493523073, + "learning_rate": 0.003, + "loss": 4.055, + "step": 27137 + }, + { + "epoch": 0.27138, + "grad_norm": 0.861177871664807, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 27138 + }, + { + "epoch": 0.27139, + "grad_norm": 0.8723914544026439, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 27139 + }, + { + "epoch": 0.2714, + "grad_norm": 0.85586348583267, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 27140 + }, + { + "epoch": 0.27141, + "grad_norm": 0.8224170188597626, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 27141 + }, + { + "epoch": 0.27142, + "grad_norm": 0.7654828284087059, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 27142 + }, + { + "epoch": 0.27143, + "grad_norm": 0.7853944433263345, + "learning_rate": 0.003, + "loss": 4.039, + "step": 27143 + }, + { + "epoch": 0.27144, + "grad_norm": 0.762823941353146, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 27144 + }, + { + "epoch": 0.27145, + "grad_norm": 0.7591514787711713, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 27145 + }, + { + "epoch": 0.27146, + "grad_norm": 0.6642447035899998, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 27146 + }, + { + "epoch": 0.27147, + "grad_norm": 0.5599208879429333, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 27147 + }, + { + "epoch": 0.27148, + "grad_norm": 0.557031027162787, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 27148 + }, + { + "epoch": 0.27149, + "grad_norm": 0.6624987492243699, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 27149 + }, + { + "epoch": 0.2715, + "grad_norm": 0.8205766165204013, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 27150 + }, + { + "epoch": 0.27151, + "grad_norm": 1.0809207998766002, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 27151 + }, + { + "epoch": 0.27152, + "grad_norm": 1.1438267998468274, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 27152 + }, + { + "epoch": 0.27153, + "grad_norm": 0.755220917245375, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 27153 + }, + { + "epoch": 0.27154, + "grad_norm": 0.5773218145613392, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 27154 + }, + { + "epoch": 0.27155, + "grad_norm": 0.5833234859079485, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 27155 + }, + { + "epoch": 0.27156, + "grad_norm": 0.5486132756215911, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 27156 + }, + { + "epoch": 0.27157, + "grad_norm": 0.5170053161382886, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 27157 + }, + { + "epoch": 0.27158, + "grad_norm": 0.5594581439251992, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 27158 + }, + { + "epoch": 0.27159, + "grad_norm": 0.6156913287480137, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 27159 + }, + { + "epoch": 0.2716, + "grad_norm": 0.6756345566314433, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 27160 + }, + { + "epoch": 0.27161, + "grad_norm": 0.6587090753572383, + "learning_rate": 0.003, + "loss": 4.012, + "step": 27161 + }, + { + "epoch": 0.27162, + "grad_norm": 0.6783537475995175, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 27162 + }, + { + "epoch": 0.27163, + "grad_norm": 0.6963303488291372, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 27163 + }, + { + "epoch": 0.27164, + "grad_norm": 0.6358036143558932, + "learning_rate": 0.003, + "loss": 4.03, + "step": 27164 + }, + { + "epoch": 0.27165, + "grad_norm": 0.6808144367447524, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 27165 + }, + { + "epoch": 0.27166, + "grad_norm": 0.7516887342657324, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 27166 + }, + { + "epoch": 0.27167, + "grad_norm": 0.9104422701216477, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 27167 + }, + { + "epoch": 0.27168, + "grad_norm": 1.1297040554046238, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 27168 + }, + { + "epoch": 0.27169, + "grad_norm": 1.1366329671051423, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 27169 + }, + { + "epoch": 0.2717, + "grad_norm": 1.0846159002262368, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 27170 + }, + { + "epoch": 0.27171, + "grad_norm": 0.920353978863042, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 27171 + }, + { + "epoch": 0.27172, + "grad_norm": 0.867191921196321, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 27172 + }, + { + "epoch": 0.27173, + "grad_norm": 0.9742443538714569, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 27173 + }, + { + "epoch": 0.27174, + "grad_norm": 1.2222349070009826, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 27174 + }, + { + "epoch": 0.27175, + "grad_norm": 0.9470124587449692, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 27175 + }, + { + "epoch": 0.27176, + "grad_norm": 0.9986427860980546, + "learning_rate": 0.003, + "loss": 3.9932, + "step": 27176 + }, + { + "epoch": 0.27177, + "grad_norm": 1.0061134088639327, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 27177 + }, + { + "epoch": 0.27178, + "grad_norm": 1.068778278612357, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 27178 + }, + { + "epoch": 0.27179, + "grad_norm": 0.8775087932883221, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 27179 + }, + { + "epoch": 0.2718, + "grad_norm": 0.9305609233138967, + "learning_rate": 0.003, + "loss": 4.056, + "step": 27180 + }, + { + "epoch": 0.27181, + "grad_norm": 0.9861934838987505, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 27181 + }, + { + "epoch": 0.27182, + "grad_norm": 1.0686209603007657, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 27182 + }, + { + "epoch": 0.27183, + "grad_norm": 1.1943199834725478, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 27183 + }, + { + "epoch": 0.27184, + "grad_norm": 0.9376355587884407, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 27184 + }, + { + "epoch": 0.27185, + "grad_norm": 0.9271383828483835, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 27185 + }, + { + "epoch": 0.27186, + "grad_norm": 1.130912385689637, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 27186 + }, + { + "epoch": 0.27187, + "grad_norm": 1.2605807924324426, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 27187 + }, + { + "epoch": 0.27188, + "grad_norm": 0.8822055764532039, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 27188 + }, + { + "epoch": 0.27189, + "grad_norm": 0.7044681395386407, + "learning_rate": 0.003, + "loss": 4.051, + "step": 27189 + }, + { + "epoch": 0.2719, + "grad_norm": 0.7237612405370422, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 27190 + }, + { + "epoch": 0.27191, + "grad_norm": 0.7319327968793413, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 27191 + }, + { + "epoch": 0.27192, + "grad_norm": 0.818410428876247, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 27192 + }, + { + "epoch": 0.27193, + "grad_norm": 0.8660669763886822, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 27193 + }, + { + "epoch": 0.27194, + "grad_norm": 0.7490120690025587, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 27194 + }, + { + "epoch": 0.27195, + "grad_norm": 0.6820603575835106, + "learning_rate": 0.003, + "loss": 4.037, + "step": 27195 + }, + { + "epoch": 0.27196, + "grad_norm": 0.861805402471239, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 27196 + }, + { + "epoch": 0.27197, + "grad_norm": 1.2378956694820602, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 27197 + }, + { + "epoch": 0.27198, + "grad_norm": 0.8402970718769878, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 27198 + }, + { + "epoch": 0.27199, + "grad_norm": 0.7273603596835698, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 27199 + }, + { + "epoch": 0.272, + "grad_norm": 0.7624880957382534, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 27200 + }, + { + "epoch": 0.27201, + "grad_norm": 0.7794119083905486, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 27201 + }, + { + "epoch": 0.27202, + "grad_norm": 0.758622059909281, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 27202 + }, + { + "epoch": 0.27203, + "grad_norm": 0.7356742586105665, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 27203 + }, + { + "epoch": 0.27204, + "grad_norm": 0.7246696461181776, + "learning_rate": 0.003, + "loss": 4.036, + "step": 27204 + }, + { + "epoch": 0.27205, + "grad_norm": 0.5788274821704399, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 27205 + }, + { + "epoch": 0.27206, + "grad_norm": 0.6154187907496541, + "learning_rate": 0.003, + "loss": 4.039, + "step": 27206 + }, + { + "epoch": 0.27207, + "grad_norm": 0.6407113391895068, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 27207 + }, + { + "epoch": 0.27208, + "grad_norm": 0.6930326619486366, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 27208 + }, + { + "epoch": 0.27209, + "grad_norm": 0.7517166962841816, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 27209 + }, + { + "epoch": 0.2721, + "grad_norm": 0.9916192344650318, + "learning_rate": 0.003, + "loss": 4.03, + "step": 27210 + }, + { + "epoch": 0.27211, + "grad_norm": 1.3079968052598168, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 27211 + }, + { + "epoch": 0.27212, + "grad_norm": 0.9239598896598635, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 27212 + }, + { + "epoch": 0.27213, + "grad_norm": 0.8342140539615156, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 27213 + }, + { + "epoch": 0.27214, + "grad_norm": 0.7546189689001775, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 27214 + }, + { + "epoch": 0.27215, + "grad_norm": 0.8955815721229361, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 27215 + }, + { + "epoch": 0.27216, + "grad_norm": 1.0597620520777873, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 27216 + }, + { + "epoch": 0.27217, + "grad_norm": 0.9424885710364823, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 27217 + }, + { + "epoch": 0.27218, + "grad_norm": 0.8177987315497143, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 27218 + }, + { + "epoch": 0.27219, + "grad_norm": 0.7475638949977249, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 27219 + }, + { + "epoch": 0.2722, + "grad_norm": 0.8014783029565177, + "learning_rate": 0.003, + "loss": 4.029, + "step": 27220 + }, + { + "epoch": 0.27221, + "grad_norm": 0.8215270742283234, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 27221 + }, + { + "epoch": 0.27222, + "grad_norm": 0.8347503104742771, + "learning_rate": 0.003, + "loss": 4.028, + "step": 27222 + }, + { + "epoch": 0.27223, + "grad_norm": 0.8350513510638556, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 27223 + }, + { + "epoch": 0.27224, + "grad_norm": 0.8477893458675214, + "learning_rate": 0.003, + "loss": 4.052, + "step": 27224 + }, + { + "epoch": 0.27225, + "grad_norm": 0.8355419986007538, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 27225 + }, + { + "epoch": 0.27226, + "grad_norm": 0.8938793453500699, + "learning_rate": 0.003, + "loss": 4.047, + "step": 27226 + }, + { + "epoch": 0.27227, + "grad_norm": 0.9622692638370453, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 27227 + }, + { + "epoch": 0.27228, + "grad_norm": 1.0722030414873116, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 27228 + }, + { + "epoch": 0.27229, + "grad_norm": 1.070775154175559, + "learning_rate": 0.003, + "loss": 4.064, + "step": 27229 + }, + { + "epoch": 0.2723, + "grad_norm": 0.9539822459429551, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 27230 + }, + { + "epoch": 0.27231, + "grad_norm": 0.914327658204957, + "learning_rate": 0.003, + "loss": 4.047, + "step": 27231 + }, + { + "epoch": 0.27232, + "grad_norm": 0.8043332402395875, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 27232 + }, + { + "epoch": 0.27233, + "grad_norm": 0.8544020222686842, + "learning_rate": 0.003, + "loss": 4.049, + "step": 27233 + }, + { + "epoch": 0.27234, + "grad_norm": 0.8237668204075224, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 27234 + }, + { + "epoch": 0.27235, + "grad_norm": 1.0479000289143938, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 27235 + }, + { + "epoch": 0.27236, + "grad_norm": 1.3968133038216284, + "learning_rate": 0.003, + "loss": 4.066, + "step": 27236 + }, + { + "epoch": 0.27237, + "grad_norm": 0.7369511324294364, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 27237 + }, + { + "epoch": 0.27238, + "grad_norm": 0.632308940995445, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 27238 + }, + { + "epoch": 0.27239, + "grad_norm": 0.7400695892031343, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 27239 + }, + { + "epoch": 0.2724, + "grad_norm": 0.7754137236259059, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 27240 + }, + { + "epoch": 0.27241, + "grad_norm": 0.8545998274122126, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 27241 + }, + { + "epoch": 0.27242, + "grad_norm": 0.9077315981367386, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 27242 + }, + { + "epoch": 0.27243, + "grad_norm": 0.9886318804741363, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 27243 + }, + { + "epoch": 0.27244, + "grad_norm": 0.9752201253308417, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 27244 + }, + { + "epoch": 0.27245, + "grad_norm": 1.0083065574250503, + "learning_rate": 0.003, + "loss": 4.008, + "step": 27245 + }, + { + "epoch": 0.27246, + "grad_norm": 1.086315963474152, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 27246 + }, + { + "epoch": 0.27247, + "grad_norm": 0.9159454827544614, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 27247 + }, + { + "epoch": 0.27248, + "grad_norm": 1.0142733265473942, + "learning_rate": 0.003, + "loss": 4.045, + "step": 27248 + }, + { + "epoch": 0.27249, + "grad_norm": 1.0016145143010495, + "learning_rate": 0.003, + "loss": 4.063, + "step": 27249 + }, + { + "epoch": 0.2725, + "grad_norm": 0.869441591027775, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 27250 + }, + { + "epoch": 0.27251, + "grad_norm": 0.8866232941368838, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 27251 + }, + { + "epoch": 0.27252, + "grad_norm": 0.9350087517829194, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 27252 + }, + { + "epoch": 0.27253, + "grad_norm": 1.1177539187283532, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 27253 + }, + { + "epoch": 0.27254, + "grad_norm": 0.8880282367857625, + "learning_rate": 0.003, + "loss": 4.03, + "step": 27254 + }, + { + "epoch": 0.27255, + "grad_norm": 0.9411424436684704, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 27255 + }, + { + "epoch": 0.27256, + "grad_norm": 0.7783286849407326, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 27256 + }, + { + "epoch": 0.27257, + "grad_norm": 0.8126324789590152, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 27257 + }, + { + "epoch": 0.27258, + "grad_norm": 0.8290273613677162, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 27258 + }, + { + "epoch": 0.27259, + "grad_norm": 0.9684653998788338, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 27259 + }, + { + "epoch": 0.2726, + "grad_norm": 0.9244270178857742, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 27260 + }, + { + "epoch": 0.27261, + "grad_norm": 0.9217923968216553, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 27261 + }, + { + "epoch": 0.27262, + "grad_norm": 0.9587059881403286, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 27262 + }, + { + "epoch": 0.27263, + "grad_norm": 1.0331796733005223, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 27263 + }, + { + "epoch": 0.27264, + "grad_norm": 1.0092460868206161, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 27264 + }, + { + "epoch": 0.27265, + "grad_norm": 0.880725136941026, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 27265 + }, + { + "epoch": 0.27266, + "grad_norm": 0.8665104284707065, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 27266 + }, + { + "epoch": 0.27267, + "grad_norm": 0.9551963454080732, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 27267 + }, + { + "epoch": 0.27268, + "grad_norm": 0.8872078032651591, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 27268 + }, + { + "epoch": 0.27269, + "grad_norm": 0.9692175837862683, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 27269 + }, + { + "epoch": 0.2727, + "grad_norm": 1.074150428979464, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 27270 + }, + { + "epoch": 0.27271, + "grad_norm": 0.9143091916217028, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 27271 + }, + { + "epoch": 0.27272, + "grad_norm": 0.8822137906872127, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 27272 + }, + { + "epoch": 0.27273, + "grad_norm": 0.9499452123431154, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 27273 + }, + { + "epoch": 0.27274, + "grad_norm": 1.0456034779341927, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 27274 + }, + { + "epoch": 0.27275, + "grad_norm": 0.9729708578284676, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 27275 + }, + { + "epoch": 0.27276, + "grad_norm": 1.0633725646122136, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 27276 + }, + { + "epoch": 0.27277, + "grad_norm": 0.9537859806366207, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 27277 + }, + { + "epoch": 0.27278, + "grad_norm": 0.8493531283932716, + "learning_rate": 0.003, + "loss": 4.026, + "step": 27278 + }, + { + "epoch": 0.27279, + "grad_norm": 0.8276759932673456, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 27279 + }, + { + "epoch": 0.2728, + "grad_norm": 0.8317087605367821, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 27280 + }, + { + "epoch": 0.27281, + "grad_norm": 0.7715816180883778, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 27281 + }, + { + "epoch": 0.27282, + "grad_norm": 0.7356442549104868, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 27282 + }, + { + "epoch": 0.27283, + "grad_norm": 0.6914130122683392, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 27283 + }, + { + "epoch": 0.27284, + "grad_norm": 0.5610307407510313, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 27284 + }, + { + "epoch": 0.27285, + "grad_norm": 0.5490984524416351, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 27285 + }, + { + "epoch": 0.27286, + "grad_norm": 0.5079922041638507, + "learning_rate": 0.003, + "loss": 4.038, + "step": 27286 + }, + { + "epoch": 0.27287, + "grad_norm": 0.48904360437833155, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 27287 + }, + { + "epoch": 0.27288, + "grad_norm": 0.5240502782730533, + "learning_rate": 0.003, + "loss": 4.053, + "step": 27288 + }, + { + "epoch": 0.27289, + "grad_norm": 0.5676918185558094, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 27289 + }, + { + "epoch": 0.2729, + "grad_norm": 0.6649761945634118, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 27290 + }, + { + "epoch": 0.27291, + "grad_norm": 0.7982231627603236, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 27291 + }, + { + "epoch": 0.27292, + "grad_norm": 1.0433353132430405, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 27292 + }, + { + "epoch": 0.27293, + "grad_norm": 1.23411532504232, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 27293 + }, + { + "epoch": 0.27294, + "grad_norm": 0.7377429341283872, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 27294 + }, + { + "epoch": 0.27295, + "grad_norm": 0.7244779781529156, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 27295 + }, + { + "epoch": 0.27296, + "grad_norm": 0.7585274987002648, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 27296 + }, + { + "epoch": 0.27297, + "grad_norm": 0.657764354389836, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 27297 + }, + { + "epoch": 0.27298, + "grad_norm": 0.7360801528413442, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 27298 + }, + { + "epoch": 0.27299, + "grad_norm": 0.7987243617628677, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 27299 + }, + { + "epoch": 0.273, + "grad_norm": 0.727807379384828, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 27300 + }, + { + "epoch": 0.27301, + "grad_norm": 0.7166070650430396, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 27301 + }, + { + "epoch": 0.27302, + "grad_norm": 0.7475273477135493, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 27302 + }, + { + "epoch": 0.27303, + "grad_norm": 0.7599090479520422, + "learning_rate": 0.003, + "loss": 4.0023, + "step": 27303 + }, + { + "epoch": 0.27304, + "grad_norm": 0.9256258141417724, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 27304 + }, + { + "epoch": 0.27305, + "grad_norm": 1.1741459619162797, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 27305 + }, + { + "epoch": 0.27306, + "grad_norm": 1.0576870683499913, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 27306 + }, + { + "epoch": 0.27307, + "grad_norm": 1.0145497847098446, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 27307 + }, + { + "epoch": 0.27308, + "grad_norm": 1.0363750525419229, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 27308 + }, + { + "epoch": 0.27309, + "grad_norm": 0.9267711383795727, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 27309 + }, + { + "epoch": 0.2731, + "grad_norm": 0.8513812949719476, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 27310 + }, + { + "epoch": 0.27311, + "grad_norm": 0.8659207329557144, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 27311 + }, + { + "epoch": 0.27312, + "grad_norm": 0.9125712909286903, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 27312 + }, + { + "epoch": 0.27313, + "grad_norm": 0.9606763461215581, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 27313 + }, + { + "epoch": 0.27314, + "grad_norm": 1.0764123036690407, + "learning_rate": 0.003, + "loss": 4.052, + "step": 27314 + }, + { + "epoch": 0.27315, + "grad_norm": 0.988048159189695, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 27315 + }, + { + "epoch": 0.27316, + "grad_norm": 0.9708720998377749, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 27316 + }, + { + "epoch": 0.27317, + "grad_norm": 1.0867984117775267, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 27317 + }, + { + "epoch": 0.27318, + "grad_norm": 1.126922534519531, + "learning_rate": 0.003, + "loss": 4.062, + "step": 27318 + }, + { + "epoch": 0.27319, + "grad_norm": 0.7750799477371327, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 27319 + }, + { + "epoch": 0.2732, + "grad_norm": 0.7724907870968374, + "learning_rate": 0.003, + "loss": 4.041, + "step": 27320 + }, + { + "epoch": 0.27321, + "grad_norm": 0.7046087914483348, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 27321 + }, + { + "epoch": 0.27322, + "grad_norm": 0.6852655596043664, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 27322 + }, + { + "epoch": 0.27323, + "grad_norm": 0.8033757352943267, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 27323 + }, + { + "epoch": 0.27324, + "grad_norm": 1.109236035538625, + "learning_rate": 0.003, + "loss": 4.044, + "step": 27324 + }, + { + "epoch": 0.27325, + "grad_norm": 1.0395512501729445, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 27325 + }, + { + "epoch": 0.27326, + "grad_norm": 0.9013995133173852, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 27326 + }, + { + "epoch": 0.27327, + "grad_norm": 0.844495590475074, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 27327 + }, + { + "epoch": 0.27328, + "grad_norm": 0.7659898502913669, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 27328 + }, + { + "epoch": 0.27329, + "grad_norm": 0.6924707777748702, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 27329 + }, + { + "epoch": 0.2733, + "grad_norm": 0.6234870128310842, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 27330 + }, + { + "epoch": 0.27331, + "grad_norm": 0.5935626661317915, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 27331 + }, + { + "epoch": 0.27332, + "grad_norm": 0.5866188973320777, + "learning_rate": 0.003, + "loss": 4.04, + "step": 27332 + }, + { + "epoch": 0.27333, + "grad_norm": 0.6328134470851577, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 27333 + }, + { + "epoch": 0.27334, + "grad_norm": 0.6504591690915844, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 27334 + }, + { + "epoch": 0.27335, + "grad_norm": 0.6131369011295167, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 27335 + }, + { + "epoch": 0.27336, + "grad_norm": 0.6134938414995452, + "learning_rate": 0.003, + "loss": 4.0095, + "step": 27336 + }, + { + "epoch": 0.27337, + "grad_norm": 0.6869343387012089, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 27337 + }, + { + "epoch": 0.27338, + "grad_norm": 0.7390715135592301, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 27338 + }, + { + "epoch": 0.27339, + "grad_norm": 0.854398927820215, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 27339 + }, + { + "epoch": 0.2734, + "grad_norm": 1.162037680962209, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 27340 + }, + { + "epoch": 0.27341, + "grad_norm": 1.0222914319522627, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 27341 + }, + { + "epoch": 0.27342, + "grad_norm": 1.1520932743656174, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 27342 + }, + { + "epoch": 0.27343, + "grad_norm": 0.928014530568992, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 27343 + }, + { + "epoch": 0.27344, + "grad_norm": 0.8747890143787175, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 27344 + }, + { + "epoch": 0.27345, + "grad_norm": 0.932464368254461, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 27345 + }, + { + "epoch": 0.27346, + "grad_norm": 0.8375401549403108, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 27346 + }, + { + "epoch": 0.27347, + "grad_norm": 0.8299257008350109, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 27347 + }, + { + "epoch": 0.27348, + "grad_norm": 0.8769839374378462, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 27348 + }, + { + "epoch": 0.27349, + "grad_norm": 0.9116260944959204, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 27349 + }, + { + "epoch": 0.2735, + "grad_norm": 0.8473018241071563, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 27350 + }, + { + "epoch": 0.27351, + "grad_norm": 0.8601695102948744, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 27351 + }, + { + "epoch": 0.27352, + "grad_norm": 0.8872200277474028, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 27352 + }, + { + "epoch": 0.27353, + "grad_norm": 0.9682856190451967, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 27353 + }, + { + "epoch": 0.27354, + "grad_norm": 1.2001561094784348, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 27354 + }, + { + "epoch": 0.27355, + "grad_norm": 0.9040965369455419, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 27355 + }, + { + "epoch": 0.27356, + "grad_norm": 0.8853159728029145, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 27356 + }, + { + "epoch": 0.27357, + "grad_norm": 0.8278343288338695, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 27357 + }, + { + "epoch": 0.27358, + "grad_norm": 0.7412134315406782, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 27358 + }, + { + "epoch": 0.27359, + "grad_norm": 0.795874685838528, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 27359 + }, + { + "epoch": 0.2736, + "grad_norm": 0.8578426132069809, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 27360 + }, + { + "epoch": 0.27361, + "grad_norm": 0.8974415314256757, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 27361 + }, + { + "epoch": 0.27362, + "grad_norm": 1.0447831596074633, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 27362 + }, + { + "epoch": 0.27363, + "grad_norm": 1.0749487593622333, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 27363 + }, + { + "epoch": 0.27364, + "grad_norm": 1.0393737480988867, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 27364 + }, + { + "epoch": 0.27365, + "grad_norm": 1.0361668427685058, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 27365 + }, + { + "epoch": 0.27366, + "grad_norm": 1.0467255116067917, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 27366 + }, + { + "epoch": 0.27367, + "grad_norm": 0.9317374201583356, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 27367 + }, + { + "epoch": 0.27368, + "grad_norm": 0.8158685144532749, + "learning_rate": 0.003, + "loss": 4.049, + "step": 27368 + }, + { + "epoch": 0.27369, + "grad_norm": 0.7442092980490032, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 27369 + }, + { + "epoch": 0.2737, + "grad_norm": 0.8005797589704103, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 27370 + }, + { + "epoch": 0.27371, + "grad_norm": 0.8646964937175462, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 27371 + }, + { + "epoch": 0.27372, + "grad_norm": 0.8805736504632035, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 27372 + }, + { + "epoch": 0.27373, + "grad_norm": 0.933446894596631, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 27373 + }, + { + "epoch": 0.27374, + "grad_norm": 0.9594075699102096, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 27374 + }, + { + "epoch": 0.27375, + "grad_norm": 1.086376487679832, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 27375 + }, + { + "epoch": 0.27376, + "grad_norm": 0.8352728282675678, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 27376 + }, + { + "epoch": 0.27377, + "grad_norm": 0.729801640697119, + "learning_rate": 0.003, + "loss": 4.056, + "step": 27377 + }, + { + "epoch": 0.27378, + "grad_norm": 0.7615618850242499, + "learning_rate": 0.003, + "loss": 4.056, + "step": 27378 + }, + { + "epoch": 0.27379, + "grad_norm": 0.7441213528145411, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 27379 + }, + { + "epoch": 0.2738, + "grad_norm": 0.6394841802367003, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 27380 + }, + { + "epoch": 0.27381, + "grad_norm": 0.688430857103259, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 27381 + }, + { + "epoch": 0.27382, + "grad_norm": 0.6029620243987147, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 27382 + }, + { + "epoch": 0.27383, + "grad_norm": 0.6921292474115627, + "learning_rate": 0.003, + "loss": 4.039, + "step": 27383 + }, + { + "epoch": 0.27384, + "grad_norm": 0.7716864071315577, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 27384 + }, + { + "epoch": 0.27385, + "grad_norm": 0.8137676022361442, + "learning_rate": 0.003, + "loss": 3.9986, + "step": 27385 + }, + { + "epoch": 0.27386, + "grad_norm": 0.7663945832314726, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 27386 + }, + { + "epoch": 0.27387, + "grad_norm": 0.8016130229598967, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 27387 + }, + { + "epoch": 0.27388, + "grad_norm": 0.837626049367869, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 27388 + }, + { + "epoch": 0.27389, + "grad_norm": 0.9224566397645034, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 27389 + }, + { + "epoch": 0.2739, + "grad_norm": 1.089596290370039, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 27390 + }, + { + "epoch": 0.27391, + "grad_norm": 1.1621154766160475, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 27391 + }, + { + "epoch": 0.27392, + "grad_norm": 0.9731353592843491, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 27392 + }, + { + "epoch": 0.27393, + "grad_norm": 0.9779590561639354, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 27393 + }, + { + "epoch": 0.27394, + "grad_norm": 1.0914497742748874, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 27394 + }, + { + "epoch": 0.27395, + "grad_norm": 0.9179233980120709, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 27395 + }, + { + "epoch": 0.27396, + "grad_norm": 0.9095106732159272, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 27396 + }, + { + "epoch": 0.27397, + "grad_norm": 1.0888758215089545, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 27397 + }, + { + "epoch": 0.27398, + "grad_norm": 1.099860403811806, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 27398 + }, + { + "epoch": 0.27399, + "grad_norm": 0.7409531229994823, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 27399 + }, + { + "epoch": 0.274, + "grad_norm": 0.6927064042490775, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 27400 + }, + { + "epoch": 0.27401, + "grad_norm": 0.6907288742401734, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 27401 + }, + { + "epoch": 0.27402, + "grad_norm": 0.7555666391408589, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 27402 + }, + { + "epoch": 0.27403, + "grad_norm": 0.9091445453806137, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 27403 + }, + { + "epoch": 0.27404, + "grad_norm": 1.1411906713898534, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 27404 + }, + { + "epoch": 0.27405, + "grad_norm": 0.7968811950323658, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 27405 + }, + { + "epoch": 0.27406, + "grad_norm": 0.7851245853226373, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 27406 + }, + { + "epoch": 0.27407, + "grad_norm": 0.8569170883906466, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 27407 + }, + { + "epoch": 0.27408, + "grad_norm": 0.9052556250636563, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 27408 + }, + { + "epoch": 0.27409, + "grad_norm": 0.8330541597480289, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 27409 + }, + { + "epoch": 0.2741, + "grad_norm": 0.732094492759006, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 27410 + }, + { + "epoch": 0.27411, + "grad_norm": 0.742547772747988, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 27411 + }, + { + "epoch": 0.27412, + "grad_norm": 0.7378542574442841, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 27412 + }, + { + "epoch": 0.27413, + "grad_norm": 0.7851284151614838, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 27413 + }, + { + "epoch": 0.27414, + "grad_norm": 1.0304719123773676, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 27414 + }, + { + "epoch": 0.27415, + "grad_norm": 1.2207677777607162, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 27415 + }, + { + "epoch": 0.27416, + "grad_norm": 0.8505421595909284, + "learning_rate": 0.003, + "loss": 4.065, + "step": 27416 + }, + { + "epoch": 0.27417, + "grad_norm": 0.8363953408275911, + "learning_rate": 0.003, + "loss": 4.029, + "step": 27417 + }, + { + "epoch": 0.27418, + "grad_norm": 0.905373949766154, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 27418 + }, + { + "epoch": 0.27419, + "grad_norm": 0.896003695432936, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 27419 + }, + { + "epoch": 0.2742, + "grad_norm": 0.8067627195702282, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 27420 + }, + { + "epoch": 0.27421, + "grad_norm": 0.7554626465469829, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 27421 + }, + { + "epoch": 0.27422, + "grad_norm": 0.7335073313249293, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 27422 + }, + { + "epoch": 0.27423, + "grad_norm": 0.7808773093527447, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 27423 + }, + { + "epoch": 0.27424, + "grad_norm": 0.8302458614855671, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 27424 + }, + { + "epoch": 0.27425, + "grad_norm": 1.0029314810599526, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 27425 + }, + { + "epoch": 0.27426, + "grad_norm": 1.3160540850285514, + "learning_rate": 0.003, + "loss": 4.055, + "step": 27426 + }, + { + "epoch": 0.27427, + "grad_norm": 0.8309379346634932, + "learning_rate": 0.003, + "loss": 4.072, + "step": 27427 + }, + { + "epoch": 0.27428, + "grad_norm": 0.813704016976871, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 27428 + }, + { + "epoch": 0.27429, + "grad_norm": 0.8719536254560902, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 27429 + }, + { + "epoch": 0.2743, + "grad_norm": 0.9626717658333871, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 27430 + }, + { + "epoch": 0.27431, + "grad_norm": 1.188537845017463, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 27431 + }, + { + "epoch": 0.27432, + "grad_norm": 0.833775180003765, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 27432 + }, + { + "epoch": 0.27433, + "grad_norm": 0.7038873946107546, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 27433 + }, + { + "epoch": 0.27434, + "grad_norm": 0.6979402822788379, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 27434 + }, + { + "epoch": 0.27435, + "grad_norm": 0.7241907213582798, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 27435 + }, + { + "epoch": 0.27436, + "grad_norm": 0.7733991153300931, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 27436 + }, + { + "epoch": 0.27437, + "grad_norm": 0.7992664317378999, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 27437 + }, + { + "epoch": 0.27438, + "grad_norm": 0.8387029662768279, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 27438 + }, + { + "epoch": 0.27439, + "grad_norm": 0.8699711568884091, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 27439 + }, + { + "epoch": 0.2744, + "grad_norm": 0.7649466351581194, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 27440 + }, + { + "epoch": 0.27441, + "grad_norm": 0.7269121329275984, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 27441 + }, + { + "epoch": 0.27442, + "grad_norm": 0.8942315293940386, + "learning_rate": 0.003, + "loss": 4.047, + "step": 27442 + }, + { + "epoch": 0.27443, + "grad_norm": 1.0521431763550897, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 27443 + }, + { + "epoch": 0.27444, + "grad_norm": 1.122233410381414, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 27444 + }, + { + "epoch": 0.27445, + "grad_norm": 0.8866059190875801, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 27445 + }, + { + "epoch": 0.27446, + "grad_norm": 0.8689669941384291, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 27446 + }, + { + "epoch": 0.27447, + "grad_norm": 0.7137318570806475, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 27447 + }, + { + "epoch": 0.27448, + "grad_norm": 0.7650391889159205, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 27448 + }, + { + "epoch": 0.27449, + "grad_norm": 0.7786522584353585, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 27449 + }, + { + "epoch": 0.2745, + "grad_norm": 0.9099556178809449, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 27450 + }, + { + "epoch": 0.27451, + "grad_norm": 1.265239383200757, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 27451 + }, + { + "epoch": 0.27452, + "grad_norm": 0.9085210312891671, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 27452 + }, + { + "epoch": 0.27453, + "grad_norm": 0.8623858556357957, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 27453 + }, + { + "epoch": 0.27454, + "grad_norm": 0.798148304670742, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 27454 + }, + { + "epoch": 0.27455, + "grad_norm": 0.7619294803156953, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 27455 + }, + { + "epoch": 0.27456, + "grad_norm": 0.7288712740810295, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 27456 + }, + { + "epoch": 0.27457, + "grad_norm": 0.7439268947037079, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 27457 + }, + { + "epoch": 0.27458, + "grad_norm": 0.7473738848777058, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 27458 + }, + { + "epoch": 0.27459, + "grad_norm": 0.8173784568876616, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 27459 + }, + { + "epoch": 0.2746, + "grad_norm": 0.9099098827005555, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 27460 + }, + { + "epoch": 0.27461, + "grad_norm": 0.9607318446307068, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 27461 + }, + { + "epoch": 0.27462, + "grad_norm": 1.007414330472367, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 27462 + }, + { + "epoch": 0.27463, + "grad_norm": 0.9626430689450209, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 27463 + }, + { + "epoch": 0.27464, + "grad_norm": 1.1510292735304786, + "learning_rate": 0.003, + "loss": 4.046, + "step": 27464 + }, + { + "epoch": 0.27465, + "grad_norm": 1.0595244805374822, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 27465 + }, + { + "epoch": 0.27466, + "grad_norm": 1.1421760151141422, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 27466 + }, + { + "epoch": 0.27467, + "grad_norm": 0.8810728019106769, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 27467 + }, + { + "epoch": 0.27468, + "grad_norm": 0.8464373680201983, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 27468 + }, + { + "epoch": 0.27469, + "grad_norm": 0.9057064217496835, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 27469 + }, + { + "epoch": 0.2747, + "grad_norm": 1.017479383988474, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 27470 + }, + { + "epoch": 0.27471, + "grad_norm": 1.0301026811185738, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 27471 + }, + { + "epoch": 0.27472, + "grad_norm": 1.0500518557284348, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 27472 + }, + { + "epoch": 0.27473, + "grad_norm": 0.8434829794212577, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 27473 + }, + { + "epoch": 0.27474, + "grad_norm": 0.8940630212320206, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 27474 + }, + { + "epoch": 0.27475, + "grad_norm": 0.80898127988051, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 27475 + }, + { + "epoch": 0.27476, + "grad_norm": 0.7479332169950791, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 27476 + }, + { + "epoch": 0.27477, + "grad_norm": 0.8079512854645222, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 27477 + }, + { + "epoch": 0.27478, + "grad_norm": 0.8539775038964349, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 27478 + }, + { + "epoch": 0.27479, + "grad_norm": 0.849720706539171, + "learning_rate": 0.003, + "loss": 4.047, + "step": 27479 + }, + { + "epoch": 0.2748, + "grad_norm": 0.956241947569029, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 27480 + }, + { + "epoch": 0.27481, + "grad_norm": 1.054270653923364, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 27481 + }, + { + "epoch": 0.27482, + "grad_norm": 1.0749110221807576, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 27482 + }, + { + "epoch": 0.27483, + "grad_norm": 0.921728764939343, + "learning_rate": 0.003, + "loss": 4.054, + "step": 27483 + }, + { + "epoch": 0.27484, + "grad_norm": 0.920436723382143, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 27484 + }, + { + "epoch": 0.27485, + "grad_norm": 0.7826230725032673, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 27485 + }, + { + "epoch": 0.27486, + "grad_norm": 0.7187615351864852, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 27486 + }, + { + "epoch": 0.27487, + "grad_norm": 0.766182594149174, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 27487 + }, + { + "epoch": 0.27488, + "grad_norm": 0.839585401808998, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 27488 + }, + { + "epoch": 0.27489, + "grad_norm": 0.807975441141832, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 27489 + }, + { + "epoch": 0.2749, + "grad_norm": 0.6721582227795454, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 27490 + }, + { + "epoch": 0.27491, + "grad_norm": 0.8145174706207421, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 27491 + }, + { + "epoch": 0.27492, + "grad_norm": 1.1230711784260712, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 27492 + }, + { + "epoch": 0.27493, + "grad_norm": 1.1832815283361864, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 27493 + }, + { + "epoch": 0.27494, + "grad_norm": 0.9458927462513933, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 27494 + }, + { + "epoch": 0.27495, + "grad_norm": 1.0128338005700075, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 27495 + }, + { + "epoch": 0.27496, + "grad_norm": 0.9553114795826317, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 27496 + }, + { + "epoch": 0.27497, + "grad_norm": 0.924488979917049, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 27497 + }, + { + "epoch": 0.27498, + "grad_norm": 0.9187470908377797, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 27498 + }, + { + "epoch": 0.27499, + "grad_norm": 0.8016636539873732, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 27499 + }, + { + "epoch": 0.275, + "grad_norm": 0.7871270932330456, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 27500 + }, + { + "epoch": 0.27501, + "grad_norm": 0.7539216212731229, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 27501 + }, + { + "epoch": 0.27502, + "grad_norm": 0.6853184314253955, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 27502 + }, + { + "epoch": 0.27503, + "grad_norm": 0.665335695209181, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 27503 + }, + { + "epoch": 0.27504, + "grad_norm": 0.6355608661549148, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 27504 + }, + { + "epoch": 0.27505, + "grad_norm": 0.6549052251639845, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 27505 + }, + { + "epoch": 0.27506, + "grad_norm": 0.6670533595753011, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 27506 + }, + { + "epoch": 0.27507, + "grad_norm": 0.7534124691921559, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 27507 + }, + { + "epoch": 0.27508, + "grad_norm": 0.9707481010121497, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 27508 + }, + { + "epoch": 0.27509, + "grad_norm": 1.0935030766444684, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 27509 + }, + { + "epoch": 0.2751, + "grad_norm": 0.8132493805216545, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 27510 + }, + { + "epoch": 0.27511, + "grad_norm": 0.766723427401172, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 27511 + }, + { + "epoch": 0.27512, + "grad_norm": 0.7671989928210858, + "learning_rate": 0.003, + "loss": 4.028, + "step": 27512 + }, + { + "epoch": 0.27513, + "grad_norm": 0.7408202184440116, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 27513 + }, + { + "epoch": 0.27514, + "grad_norm": 0.5759000077232321, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 27514 + }, + { + "epoch": 0.27515, + "grad_norm": 0.6255602149289347, + "learning_rate": 0.003, + "loss": 4.017, + "step": 27515 + }, + { + "epoch": 0.27516, + "grad_norm": 0.6872200632585111, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 27516 + }, + { + "epoch": 0.27517, + "grad_norm": 0.716969980801242, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 27517 + }, + { + "epoch": 0.27518, + "grad_norm": 0.7024144451198358, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 27518 + }, + { + "epoch": 0.27519, + "grad_norm": 0.7581706269103492, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 27519 + }, + { + "epoch": 0.2752, + "grad_norm": 0.9604720079087541, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 27520 + }, + { + "epoch": 0.27521, + "grad_norm": 1.3114665604836375, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 27521 + }, + { + "epoch": 0.27522, + "grad_norm": 0.8651653085356185, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 27522 + }, + { + "epoch": 0.27523, + "grad_norm": 0.7436616118521834, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 27523 + }, + { + "epoch": 0.27524, + "grad_norm": 0.7171113980226964, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 27524 + }, + { + "epoch": 0.27525, + "grad_norm": 0.6959938546664353, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 27525 + }, + { + "epoch": 0.27526, + "grad_norm": 0.6748813958680602, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 27526 + }, + { + "epoch": 0.27527, + "grad_norm": 0.7286452261243423, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 27527 + }, + { + "epoch": 0.27528, + "grad_norm": 0.7607686912134622, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 27528 + }, + { + "epoch": 0.27529, + "grad_norm": 0.9120006250314171, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 27529 + }, + { + "epoch": 0.2753, + "grad_norm": 1.1407073439280282, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 27530 + }, + { + "epoch": 0.27531, + "grad_norm": 0.9858709349982245, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 27531 + }, + { + "epoch": 0.27532, + "grad_norm": 0.8912552884464429, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 27532 + }, + { + "epoch": 0.27533, + "grad_norm": 0.9936590726987478, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 27533 + }, + { + "epoch": 0.27534, + "grad_norm": 1.0796492381332126, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 27534 + }, + { + "epoch": 0.27535, + "grad_norm": 0.8851203779477849, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 27535 + }, + { + "epoch": 0.27536, + "grad_norm": 0.9126429101858924, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 27536 + }, + { + "epoch": 0.27537, + "grad_norm": 0.9767616157389322, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 27537 + }, + { + "epoch": 0.27538, + "grad_norm": 1.071441070407833, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 27538 + }, + { + "epoch": 0.27539, + "grad_norm": 0.9992441676047388, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 27539 + }, + { + "epoch": 0.2754, + "grad_norm": 1.0135916923891748, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 27540 + }, + { + "epoch": 0.27541, + "grad_norm": 0.8785323301499186, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 27541 + }, + { + "epoch": 0.27542, + "grad_norm": 1.1002852366792761, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 27542 + }, + { + "epoch": 0.27543, + "grad_norm": 1.1528869493065075, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 27543 + }, + { + "epoch": 0.27544, + "grad_norm": 0.938983436433051, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 27544 + }, + { + "epoch": 0.27545, + "grad_norm": 0.9050150829001987, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 27545 + }, + { + "epoch": 0.27546, + "grad_norm": 1.0582811131301542, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 27546 + }, + { + "epoch": 0.27547, + "grad_norm": 1.003134902567243, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 27547 + }, + { + "epoch": 0.27548, + "grad_norm": 0.9018634262899375, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 27548 + }, + { + "epoch": 0.27549, + "grad_norm": 0.891913998853702, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 27549 + }, + { + "epoch": 0.2755, + "grad_norm": 0.8501577059348694, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 27550 + }, + { + "epoch": 0.27551, + "grad_norm": 0.744843275190083, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 27551 + }, + { + "epoch": 0.27552, + "grad_norm": 0.6584018775299888, + "learning_rate": 0.003, + "loss": 4.045, + "step": 27552 + }, + { + "epoch": 0.27553, + "grad_norm": 0.6382722930211042, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 27553 + }, + { + "epoch": 0.27554, + "grad_norm": 0.7032472993967472, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 27554 + }, + { + "epoch": 0.27555, + "grad_norm": 0.7862902123715984, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 27555 + }, + { + "epoch": 0.27556, + "grad_norm": 0.8914170284499748, + "learning_rate": 0.003, + "loss": 4.039, + "step": 27556 + }, + { + "epoch": 0.27557, + "grad_norm": 0.9859594638444603, + "learning_rate": 0.003, + "loss": 4.054, + "step": 27557 + }, + { + "epoch": 0.27558, + "grad_norm": 1.0768995820384843, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 27558 + }, + { + "epoch": 0.27559, + "grad_norm": 0.8836465160824759, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 27559 + }, + { + "epoch": 0.2756, + "grad_norm": 0.7832603604015481, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 27560 + }, + { + "epoch": 0.27561, + "grad_norm": 0.784387326972705, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 27561 + }, + { + "epoch": 0.27562, + "grad_norm": 0.7959746671086947, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 27562 + }, + { + "epoch": 0.27563, + "grad_norm": 0.8595512327817114, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 27563 + }, + { + "epoch": 0.27564, + "grad_norm": 0.9009483677327366, + "learning_rate": 0.003, + "loss": 4.056, + "step": 27564 + }, + { + "epoch": 0.27565, + "grad_norm": 1.138004258610621, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 27565 + }, + { + "epoch": 0.27566, + "grad_norm": 0.8948198221721799, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 27566 + }, + { + "epoch": 0.27567, + "grad_norm": 0.8385289070406653, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 27567 + }, + { + "epoch": 0.27568, + "grad_norm": 0.8779316993920455, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 27568 + }, + { + "epoch": 0.27569, + "grad_norm": 0.841311682968659, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 27569 + }, + { + "epoch": 0.2757, + "grad_norm": 0.8075555550850717, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 27570 + }, + { + "epoch": 0.27571, + "grad_norm": 0.8362522612064287, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 27571 + }, + { + "epoch": 0.27572, + "grad_norm": 0.8646047885843108, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 27572 + }, + { + "epoch": 0.27573, + "grad_norm": 0.9133019398553516, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 27573 + }, + { + "epoch": 0.27574, + "grad_norm": 1.0653767853850598, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 27574 + }, + { + "epoch": 0.27575, + "grad_norm": 0.9465006013101137, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 27575 + }, + { + "epoch": 0.27576, + "grad_norm": 0.7903861011812449, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 27576 + }, + { + "epoch": 0.27577, + "grad_norm": 0.7660488816458991, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 27577 + }, + { + "epoch": 0.27578, + "grad_norm": 0.8318119425186651, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 27578 + }, + { + "epoch": 0.27579, + "grad_norm": 0.8585689264857653, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 27579 + }, + { + "epoch": 0.2758, + "grad_norm": 0.961049629634931, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 27580 + }, + { + "epoch": 0.27581, + "grad_norm": 0.9949296033743659, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 27581 + }, + { + "epoch": 0.27582, + "grad_norm": 1.0856821375700363, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 27582 + }, + { + "epoch": 0.27583, + "grad_norm": 0.9466174818289098, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 27583 + }, + { + "epoch": 0.27584, + "grad_norm": 0.7693535128275024, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 27584 + }, + { + "epoch": 0.27585, + "grad_norm": 0.7882508565649331, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 27585 + }, + { + "epoch": 0.27586, + "grad_norm": 0.7851600824782171, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 27586 + }, + { + "epoch": 0.27587, + "grad_norm": 0.8369271888416528, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 27587 + }, + { + "epoch": 0.27588, + "grad_norm": 0.8761703635546019, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 27588 + }, + { + "epoch": 0.27589, + "grad_norm": 0.8874521755764253, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 27589 + }, + { + "epoch": 0.2759, + "grad_norm": 0.7802661002270506, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 27590 + }, + { + "epoch": 0.27591, + "grad_norm": 0.7504946951370824, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 27591 + }, + { + "epoch": 0.27592, + "grad_norm": 0.8182259858024599, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 27592 + }, + { + "epoch": 0.27593, + "grad_norm": 0.9383115398851327, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 27593 + }, + { + "epoch": 0.27594, + "grad_norm": 1.4007055002007227, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 27594 + }, + { + "epoch": 0.27595, + "grad_norm": 0.8905906054309448, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 27595 + }, + { + "epoch": 0.27596, + "grad_norm": 0.7471650573005397, + "learning_rate": 0.003, + "loss": 4.051, + "step": 27596 + }, + { + "epoch": 0.27597, + "grad_norm": 0.7293564069773956, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 27597 + }, + { + "epoch": 0.27598, + "grad_norm": 0.7604897598040367, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 27598 + }, + { + "epoch": 0.27599, + "grad_norm": 0.6899727928775439, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 27599 + }, + { + "epoch": 0.276, + "grad_norm": 0.7103487343974175, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 27600 + }, + { + "epoch": 0.27601, + "grad_norm": 0.7083491387647609, + "learning_rate": 0.003, + "loss": 4.042, + "step": 27601 + }, + { + "epoch": 0.27602, + "grad_norm": 0.7646286171983344, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 27602 + }, + { + "epoch": 0.27603, + "grad_norm": 0.8028549798392458, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 27603 + }, + { + "epoch": 0.27604, + "grad_norm": 0.636982763190343, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 27604 + }, + { + "epoch": 0.27605, + "grad_norm": 0.6238661683071908, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 27605 + }, + { + "epoch": 0.27606, + "grad_norm": 0.5868082054774105, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 27606 + }, + { + "epoch": 0.27607, + "grad_norm": 0.5850582054056035, + "learning_rate": 0.003, + "loss": 4.0095, + "step": 27607 + }, + { + "epoch": 0.27608, + "grad_norm": 0.6737459546000603, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 27608 + }, + { + "epoch": 0.27609, + "grad_norm": 0.8836877779695854, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 27609 + }, + { + "epoch": 0.2761, + "grad_norm": 1.255141632217191, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 27610 + }, + { + "epoch": 0.27611, + "grad_norm": 1.0925984441037215, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 27611 + }, + { + "epoch": 0.27612, + "grad_norm": 0.960404320336506, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 27612 + }, + { + "epoch": 0.27613, + "grad_norm": 0.9297919515876425, + "learning_rate": 0.003, + "loss": 4.059, + "step": 27613 + }, + { + "epoch": 0.27614, + "grad_norm": 0.9546812145858595, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 27614 + }, + { + "epoch": 0.27615, + "grad_norm": 0.9363572063942333, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 27615 + }, + { + "epoch": 0.27616, + "grad_norm": 0.97198755138276, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 27616 + }, + { + "epoch": 0.27617, + "grad_norm": 0.9544967067312224, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 27617 + }, + { + "epoch": 0.27618, + "grad_norm": 1.0424951225809767, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 27618 + }, + { + "epoch": 0.27619, + "grad_norm": 1.0711359292087823, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 27619 + }, + { + "epoch": 0.2762, + "grad_norm": 0.9463015061121943, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 27620 + }, + { + "epoch": 0.27621, + "grad_norm": 0.8765271848773937, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 27621 + }, + { + "epoch": 0.27622, + "grad_norm": 0.8976222426569188, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 27622 + }, + { + "epoch": 0.27623, + "grad_norm": 0.8528743900902608, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 27623 + }, + { + "epoch": 0.27624, + "grad_norm": 0.8052848319268165, + "learning_rate": 0.003, + "loss": 4.067, + "step": 27624 + }, + { + "epoch": 0.27625, + "grad_norm": 0.7841743375161037, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 27625 + }, + { + "epoch": 0.27626, + "grad_norm": 0.7190315588519719, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 27626 + }, + { + "epoch": 0.27627, + "grad_norm": 0.7902570669206066, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 27627 + }, + { + "epoch": 0.27628, + "grad_norm": 0.9025044881775321, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 27628 + }, + { + "epoch": 0.27629, + "grad_norm": 1.1028916753090239, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 27629 + }, + { + "epoch": 0.2763, + "grad_norm": 1.0281559255193735, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 27630 + }, + { + "epoch": 0.27631, + "grad_norm": 0.9405605180831365, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 27631 + }, + { + "epoch": 0.27632, + "grad_norm": 0.8377842085247145, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 27632 + }, + { + "epoch": 0.27633, + "grad_norm": 0.7614447136375849, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 27633 + }, + { + "epoch": 0.27634, + "grad_norm": 0.8099139958890382, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 27634 + }, + { + "epoch": 0.27635, + "grad_norm": 0.975420920740107, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 27635 + }, + { + "epoch": 0.27636, + "grad_norm": 1.01447184244507, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 27636 + }, + { + "epoch": 0.27637, + "grad_norm": 0.751799628913652, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 27637 + }, + { + "epoch": 0.27638, + "grad_norm": 0.8312255858972802, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 27638 + }, + { + "epoch": 0.27639, + "grad_norm": 0.841837812564942, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 27639 + }, + { + "epoch": 0.2764, + "grad_norm": 0.7894997121324281, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 27640 + }, + { + "epoch": 0.27641, + "grad_norm": 0.9391192122200699, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 27641 + }, + { + "epoch": 0.27642, + "grad_norm": 1.1843979657090105, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 27642 + }, + { + "epoch": 0.27643, + "grad_norm": 1.0197919052750457, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 27643 + }, + { + "epoch": 0.27644, + "grad_norm": 0.962068415634205, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 27644 + }, + { + "epoch": 0.27645, + "grad_norm": 0.8772132090128075, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 27645 + }, + { + "epoch": 0.27646, + "grad_norm": 0.7155344949614276, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 27646 + }, + { + "epoch": 0.27647, + "grad_norm": 0.7175927346953167, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 27647 + }, + { + "epoch": 0.27648, + "grad_norm": 0.7642066023439841, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 27648 + }, + { + "epoch": 0.27649, + "grad_norm": 0.7936469094437929, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 27649 + }, + { + "epoch": 0.2765, + "grad_norm": 0.9550261762873526, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 27650 + }, + { + "epoch": 0.27651, + "grad_norm": 1.1087014569894011, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 27651 + }, + { + "epoch": 0.27652, + "grad_norm": 0.9433103406770217, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 27652 + }, + { + "epoch": 0.27653, + "grad_norm": 0.9808522056359532, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 27653 + }, + { + "epoch": 0.27654, + "grad_norm": 1.0626615940755095, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 27654 + }, + { + "epoch": 0.27655, + "grad_norm": 0.8988477472329935, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 27655 + }, + { + "epoch": 0.27656, + "grad_norm": 0.7022635848675114, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 27656 + }, + { + "epoch": 0.27657, + "grad_norm": 0.6902331602076974, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 27657 + }, + { + "epoch": 0.27658, + "grad_norm": 0.6092234783281616, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 27658 + }, + { + "epoch": 0.27659, + "grad_norm": 0.6245567561604682, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 27659 + }, + { + "epoch": 0.2766, + "grad_norm": 0.6218687775757363, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 27660 + }, + { + "epoch": 0.27661, + "grad_norm": 0.6316146707095361, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 27661 + }, + { + "epoch": 0.27662, + "grad_norm": 0.6280287932510784, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 27662 + }, + { + "epoch": 0.27663, + "grad_norm": 0.6338967256994142, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 27663 + }, + { + "epoch": 0.27664, + "grad_norm": 0.6785475798502305, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 27664 + }, + { + "epoch": 0.27665, + "grad_norm": 0.6058344629552395, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 27665 + }, + { + "epoch": 0.27666, + "grad_norm": 0.55837563696932, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 27666 + }, + { + "epoch": 0.27667, + "grad_norm": 0.7190392896959156, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 27667 + }, + { + "epoch": 0.27668, + "grad_norm": 0.8378771194293715, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 27668 + }, + { + "epoch": 0.27669, + "grad_norm": 0.960676616547277, + "learning_rate": 0.003, + "loss": 4.0033, + "step": 27669 + }, + { + "epoch": 0.2767, + "grad_norm": 1.3467318148807308, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 27670 + }, + { + "epoch": 0.27671, + "grad_norm": 0.889003587592173, + "learning_rate": 0.003, + "loss": 4.017, + "step": 27671 + }, + { + "epoch": 0.27672, + "grad_norm": 0.8499111588801253, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 27672 + }, + { + "epoch": 0.27673, + "grad_norm": 0.8091909401426355, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 27673 + }, + { + "epoch": 0.27674, + "grad_norm": 0.850859091662269, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 27674 + }, + { + "epoch": 0.27675, + "grad_norm": 0.9169133488013691, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 27675 + }, + { + "epoch": 0.27676, + "grad_norm": 0.8726993322853502, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 27676 + }, + { + "epoch": 0.27677, + "grad_norm": 0.955287718402797, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 27677 + }, + { + "epoch": 0.27678, + "grad_norm": 1.141931390174076, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 27678 + }, + { + "epoch": 0.27679, + "grad_norm": 1.0654509741480143, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 27679 + }, + { + "epoch": 0.2768, + "grad_norm": 0.9685232779731515, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 27680 + }, + { + "epoch": 0.27681, + "grad_norm": 1.031978202924167, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 27681 + }, + { + "epoch": 0.27682, + "grad_norm": 1.1047614194958109, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 27682 + }, + { + "epoch": 0.27683, + "grad_norm": 1.006652796326168, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 27683 + }, + { + "epoch": 0.27684, + "grad_norm": 1.0480433535910694, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 27684 + }, + { + "epoch": 0.27685, + "grad_norm": 0.8575494085740141, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 27685 + }, + { + "epoch": 0.27686, + "grad_norm": 0.856129885307255, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 27686 + }, + { + "epoch": 0.27687, + "grad_norm": 0.8717641381324917, + "learning_rate": 0.003, + "loss": 4.032, + "step": 27687 + }, + { + "epoch": 0.27688, + "grad_norm": 0.8462140560921939, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 27688 + }, + { + "epoch": 0.27689, + "grad_norm": 0.8747042504794245, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 27689 + }, + { + "epoch": 0.2769, + "grad_norm": 0.9223154865300004, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 27690 + }, + { + "epoch": 0.27691, + "grad_norm": 0.9151001232505841, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 27691 + }, + { + "epoch": 0.27692, + "grad_norm": 0.8450108608904273, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 27692 + }, + { + "epoch": 0.27693, + "grad_norm": 0.8043145482427533, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 27693 + }, + { + "epoch": 0.27694, + "grad_norm": 0.8709143989284064, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 27694 + }, + { + "epoch": 0.27695, + "grad_norm": 0.9445815193785894, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 27695 + }, + { + "epoch": 0.27696, + "grad_norm": 1.1372197726118844, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 27696 + }, + { + "epoch": 0.27697, + "grad_norm": 1.038594463343923, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 27697 + }, + { + "epoch": 0.27698, + "grad_norm": 1.0990037271560327, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 27698 + }, + { + "epoch": 0.27699, + "grad_norm": 0.8703277416922371, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 27699 + }, + { + "epoch": 0.277, + "grad_norm": 0.7606106360053135, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 27700 + }, + { + "epoch": 0.27701, + "grad_norm": 0.8020572134204552, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 27701 + }, + { + "epoch": 0.27702, + "grad_norm": 0.7665352691454131, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 27702 + }, + { + "epoch": 0.27703, + "grad_norm": 0.7618684419550741, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 27703 + }, + { + "epoch": 0.27704, + "grad_norm": 0.7398081269681952, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 27704 + }, + { + "epoch": 0.27705, + "grad_norm": 0.7300913340101083, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 27705 + }, + { + "epoch": 0.27706, + "grad_norm": 0.8260155155307604, + "learning_rate": 0.003, + "loss": 4.054, + "step": 27706 + }, + { + "epoch": 0.27707, + "grad_norm": 1.0311231762592432, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 27707 + }, + { + "epoch": 0.27708, + "grad_norm": 1.221865800016454, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 27708 + }, + { + "epoch": 0.27709, + "grad_norm": 0.9180289595037172, + "learning_rate": 0.003, + "loss": 4.084, + "step": 27709 + }, + { + "epoch": 0.2771, + "grad_norm": 0.904617250474751, + "learning_rate": 0.003, + "loss": 4.04, + "step": 27710 + }, + { + "epoch": 0.27711, + "grad_norm": 0.8970116532026977, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 27711 + }, + { + "epoch": 0.27712, + "grad_norm": 1.019143393654043, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 27712 + }, + { + "epoch": 0.27713, + "grad_norm": 0.9770369873870997, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 27713 + }, + { + "epoch": 0.27714, + "grad_norm": 0.8917327758615112, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 27714 + }, + { + "epoch": 0.27715, + "grad_norm": 0.8457320913462244, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 27715 + }, + { + "epoch": 0.27716, + "grad_norm": 0.6013831020036178, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 27716 + }, + { + "epoch": 0.27717, + "grad_norm": 0.6744075948075614, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 27717 + }, + { + "epoch": 0.27718, + "grad_norm": 0.6278997933102557, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 27718 + }, + { + "epoch": 0.27719, + "grad_norm": 0.6816057736954539, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 27719 + }, + { + "epoch": 0.2772, + "grad_norm": 0.966792715822161, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 27720 + }, + { + "epoch": 0.27721, + "grad_norm": 1.1875868310405073, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 27721 + }, + { + "epoch": 0.27722, + "grad_norm": 0.762185685795832, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 27722 + }, + { + "epoch": 0.27723, + "grad_norm": 0.7251803898684855, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 27723 + }, + { + "epoch": 0.27724, + "grad_norm": 0.6723420135516522, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 27724 + }, + { + "epoch": 0.27725, + "grad_norm": 0.7282331536703474, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 27725 + }, + { + "epoch": 0.27726, + "grad_norm": 0.7686466279366605, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 27726 + }, + { + "epoch": 0.27727, + "grad_norm": 0.8175316216388241, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 27727 + }, + { + "epoch": 0.27728, + "grad_norm": 0.7615734294839817, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 27728 + }, + { + "epoch": 0.27729, + "grad_norm": 0.8723970541774475, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 27729 + }, + { + "epoch": 0.2773, + "grad_norm": 1.065607010320744, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 27730 + }, + { + "epoch": 0.27731, + "grad_norm": 1.105672580110797, + "learning_rate": 0.003, + "loss": 4.015, + "step": 27731 + }, + { + "epoch": 0.27732, + "grad_norm": 0.9363887759717392, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 27732 + }, + { + "epoch": 0.27733, + "grad_norm": 0.9546807277678666, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 27733 + }, + { + "epoch": 0.27734, + "grad_norm": 0.8131793136043184, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 27734 + }, + { + "epoch": 0.27735, + "grad_norm": 0.7093502584795353, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 27735 + }, + { + "epoch": 0.27736, + "grad_norm": 0.7857209296021429, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 27736 + }, + { + "epoch": 0.27737, + "grad_norm": 0.9755398190558405, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 27737 + }, + { + "epoch": 0.27738, + "grad_norm": 1.2710906573475445, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 27738 + }, + { + "epoch": 0.27739, + "grad_norm": 0.9398051898876182, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 27739 + }, + { + "epoch": 0.2774, + "grad_norm": 0.7677253397404238, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 27740 + }, + { + "epoch": 0.27741, + "grad_norm": 0.6551304795988674, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 27741 + }, + { + "epoch": 0.27742, + "grad_norm": 0.6629172162090192, + "learning_rate": 0.003, + "loss": 4.051, + "step": 27742 + }, + { + "epoch": 0.27743, + "grad_norm": 0.6233407396264823, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 27743 + }, + { + "epoch": 0.27744, + "grad_norm": 0.6415336499626928, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 27744 + }, + { + "epoch": 0.27745, + "grad_norm": 0.681549409725612, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 27745 + }, + { + "epoch": 0.27746, + "grad_norm": 0.7175425422338124, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 27746 + }, + { + "epoch": 0.27747, + "grad_norm": 0.6844464334830769, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 27747 + }, + { + "epoch": 0.27748, + "grad_norm": 0.7088178417616398, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 27748 + }, + { + "epoch": 0.27749, + "grad_norm": 0.749927216909801, + "learning_rate": 0.003, + "loss": 4.041, + "step": 27749 + }, + { + "epoch": 0.2775, + "grad_norm": 0.7320435509355352, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 27750 + }, + { + "epoch": 0.27751, + "grad_norm": 0.8429306860414595, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 27751 + }, + { + "epoch": 0.27752, + "grad_norm": 0.9394660326084713, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 27752 + }, + { + "epoch": 0.27753, + "grad_norm": 1.1115856979340801, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 27753 + }, + { + "epoch": 0.27754, + "grad_norm": 1.0390401652119963, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 27754 + }, + { + "epoch": 0.27755, + "grad_norm": 1.062476839063411, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 27755 + }, + { + "epoch": 0.27756, + "grad_norm": 0.912000103904525, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 27756 + }, + { + "epoch": 0.27757, + "grad_norm": 0.7499241971137838, + "learning_rate": 0.003, + "loss": 4.074, + "step": 27757 + }, + { + "epoch": 0.27758, + "grad_norm": 0.7108256655937034, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 27758 + }, + { + "epoch": 0.27759, + "grad_norm": 0.6601315273993372, + "learning_rate": 0.003, + "loss": 4.036, + "step": 27759 + }, + { + "epoch": 0.2776, + "grad_norm": 0.67463050193368, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 27760 + }, + { + "epoch": 0.27761, + "grad_norm": 0.7338474795899355, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 27761 + }, + { + "epoch": 0.27762, + "grad_norm": 0.8412634728730072, + "learning_rate": 0.003, + "loss": 4.059, + "step": 27762 + }, + { + "epoch": 0.27763, + "grad_norm": 0.9665825384238736, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 27763 + }, + { + "epoch": 0.27764, + "grad_norm": 0.9206432038971117, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 27764 + }, + { + "epoch": 0.27765, + "grad_norm": 0.9605177734202055, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 27765 + }, + { + "epoch": 0.27766, + "grad_norm": 0.9252931676289303, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 27766 + }, + { + "epoch": 0.27767, + "grad_norm": 0.8625364771646512, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 27767 + }, + { + "epoch": 0.27768, + "grad_norm": 0.7473038462076544, + "learning_rate": 0.003, + "loss": 4.047, + "step": 27768 + }, + { + "epoch": 0.27769, + "grad_norm": 0.7457591008070078, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 27769 + }, + { + "epoch": 0.2777, + "grad_norm": 0.7951350826269067, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 27770 + }, + { + "epoch": 0.27771, + "grad_norm": 0.7543435647657978, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 27771 + }, + { + "epoch": 0.27772, + "grad_norm": 0.7990434103741723, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 27772 + }, + { + "epoch": 0.27773, + "grad_norm": 0.8693489091303673, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 27773 + }, + { + "epoch": 0.27774, + "grad_norm": 1.0409523336540192, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 27774 + }, + { + "epoch": 0.27775, + "grad_norm": 1.4820830207602074, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 27775 + }, + { + "epoch": 0.27776, + "grad_norm": 0.6784676114034849, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 27776 + }, + { + "epoch": 0.27777, + "grad_norm": 0.763688648038413, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 27777 + }, + { + "epoch": 0.27778, + "grad_norm": 0.8996288061426083, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 27778 + }, + { + "epoch": 0.27779, + "grad_norm": 0.9989176676255409, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 27779 + }, + { + "epoch": 0.2778, + "grad_norm": 1.2467734365522196, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 27780 + }, + { + "epoch": 0.27781, + "grad_norm": 0.7630680685782786, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 27781 + }, + { + "epoch": 0.27782, + "grad_norm": 0.8057477199246448, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 27782 + }, + { + "epoch": 0.27783, + "grad_norm": 0.9190716881470262, + "learning_rate": 0.003, + "loss": 4.06, + "step": 27783 + }, + { + "epoch": 0.27784, + "grad_norm": 0.8926056038234027, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 27784 + }, + { + "epoch": 0.27785, + "grad_norm": 0.756077275330324, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 27785 + }, + { + "epoch": 0.27786, + "grad_norm": 0.7531080998622893, + "learning_rate": 0.003, + "loss": 4.048, + "step": 27786 + }, + { + "epoch": 0.27787, + "grad_norm": 0.7822311049615184, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 27787 + }, + { + "epoch": 0.27788, + "grad_norm": 0.7807548625921784, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 27788 + }, + { + "epoch": 0.27789, + "grad_norm": 0.7470934696042298, + "learning_rate": 0.003, + "loss": 4.037, + "step": 27789 + }, + { + "epoch": 0.2779, + "grad_norm": 0.7326629568457709, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 27790 + }, + { + "epoch": 0.27791, + "grad_norm": 0.7586532685268151, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 27791 + }, + { + "epoch": 0.27792, + "grad_norm": 0.8208532673893526, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 27792 + }, + { + "epoch": 0.27793, + "grad_norm": 1.0064917332497292, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 27793 + }, + { + "epoch": 0.27794, + "grad_norm": 0.9505555319079347, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 27794 + }, + { + "epoch": 0.27795, + "grad_norm": 1.0600187195667736, + "learning_rate": 0.003, + "loss": 3.9918, + "step": 27795 + }, + { + "epoch": 0.27796, + "grad_norm": 1.306834770203538, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 27796 + }, + { + "epoch": 0.27797, + "grad_norm": 0.881321391732377, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 27797 + }, + { + "epoch": 0.27798, + "grad_norm": 0.8376607861776786, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 27798 + }, + { + "epoch": 0.27799, + "grad_norm": 1.0291808042421235, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 27799 + }, + { + "epoch": 0.278, + "grad_norm": 1.0566389003564205, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 27800 + }, + { + "epoch": 0.27801, + "grad_norm": 0.8224660212800197, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 27801 + }, + { + "epoch": 0.27802, + "grad_norm": 0.7147539245397581, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 27802 + }, + { + "epoch": 0.27803, + "grad_norm": 0.7287328907944992, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 27803 + }, + { + "epoch": 0.27804, + "grad_norm": 0.6625093202171249, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 27804 + }, + { + "epoch": 0.27805, + "grad_norm": 0.6748706920053364, + "learning_rate": 0.003, + "loss": 4.049, + "step": 27805 + }, + { + "epoch": 0.27806, + "grad_norm": 0.7491405117399559, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 27806 + }, + { + "epoch": 0.27807, + "grad_norm": 0.8796120939308266, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 27807 + }, + { + "epoch": 0.27808, + "grad_norm": 0.8704202908653403, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 27808 + }, + { + "epoch": 0.27809, + "grad_norm": 0.9021126517072299, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 27809 + }, + { + "epoch": 0.2781, + "grad_norm": 1.1379939501998537, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 27810 + }, + { + "epoch": 0.27811, + "grad_norm": 1.2033315361788426, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 27811 + }, + { + "epoch": 0.27812, + "grad_norm": 0.9523129418648865, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 27812 + }, + { + "epoch": 0.27813, + "grad_norm": 0.8689210195531076, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 27813 + }, + { + "epoch": 0.27814, + "grad_norm": 0.8790348184112382, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 27814 + }, + { + "epoch": 0.27815, + "grad_norm": 0.8655162310969657, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 27815 + }, + { + "epoch": 0.27816, + "grad_norm": 1.0942776128486744, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 27816 + }, + { + "epoch": 0.27817, + "grad_norm": 1.2697171814000883, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 27817 + }, + { + "epoch": 0.27818, + "grad_norm": 0.7206369328185163, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 27818 + }, + { + "epoch": 0.27819, + "grad_norm": 0.732753271736042, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 27819 + }, + { + "epoch": 0.2782, + "grad_norm": 0.7121884278535716, + "learning_rate": 0.003, + "loss": 4.065, + "step": 27820 + }, + { + "epoch": 0.27821, + "grad_norm": 0.7092440202057201, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 27821 + }, + { + "epoch": 0.27822, + "grad_norm": 0.7172876608902816, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 27822 + }, + { + "epoch": 0.27823, + "grad_norm": 0.7005799500451574, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 27823 + }, + { + "epoch": 0.27824, + "grad_norm": 0.6879468521001236, + "learning_rate": 0.003, + "loss": 4.031, + "step": 27824 + }, + { + "epoch": 0.27825, + "grad_norm": 0.6661186872515242, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 27825 + }, + { + "epoch": 0.27826, + "grad_norm": 0.718285946271353, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 27826 + }, + { + "epoch": 0.27827, + "grad_norm": 0.7633395697862321, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 27827 + }, + { + "epoch": 0.27828, + "grad_norm": 0.906512637957444, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 27828 + }, + { + "epoch": 0.27829, + "grad_norm": 1.17000966656375, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 27829 + }, + { + "epoch": 0.2783, + "grad_norm": 1.0591022360665268, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 27830 + }, + { + "epoch": 0.27831, + "grad_norm": 0.9702132818955198, + "learning_rate": 0.003, + "loss": 4.057, + "step": 27831 + }, + { + "epoch": 0.27832, + "grad_norm": 1.005422100554178, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 27832 + }, + { + "epoch": 0.27833, + "grad_norm": 1.0064103001893616, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 27833 + }, + { + "epoch": 0.27834, + "grad_norm": 0.897844686510108, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 27834 + }, + { + "epoch": 0.27835, + "grad_norm": 0.8772082920356794, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 27835 + }, + { + "epoch": 0.27836, + "grad_norm": 0.8640460615442753, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 27836 + }, + { + "epoch": 0.27837, + "grad_norm": 1.021782082138233, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 27837 + }, + { + "epoch": 0.27838, + "grad_norm": 1.2280121572791725, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 27838 + }, + { + "epoch": 0.27839, + "grad_norm": 0.6790979146120029, + "learning_rate": 0.003, + "loss": 4.053, + "step": 27839 + }, + { + "epoch": 0.2784, + "grad_norm": 0.6779283847040101, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 27840 + }, + { + "epoch": 0.27841, + "grad_norm": 0.6011727576649475, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 27841 + }, + { + "epoch": 0.27842, + "grad_norm": 0.6530577998491776, + "learning_rate": 0.003, + "loss": 4.047, + "step": 27842 + }, + { + "epoch": 0.27843, + "grad_norm": 0.6988211762068519, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 27843 + }, + { + "epoch": 0.27844, + "grad_norm": 0.758140669348051, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 27844 + }, + { + "epoch": 0.27845, + "grad_norm": 0.8706249382858688, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 27845 + }, + { + "epoch": 0.27846, + "grad_norm": 1.0379495941421972, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 27846 + }, + { + "epoch": 0.27847, + "grad_norm": 1.0355749298790293, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 27847 + }, + { + "epoch": 0.27848, + "grad_norm": 0.9324102792948058, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 27848 + }, + { + "epoch": 0.27849, + "grad_norm": 1.0517577001975258, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 27849 + }, + { + "epoch": 0.2785, + "grad_norm": 1.054501901055831, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 27850 + }, + { + "epoch": 0.27851, + "grad_norm": 1.0645454539983148, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 27851 + }, + { + "epoch": 0.27852, + "grad_norm": 0.9128393142462878, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 27852 + }, + { + "epoch": 0.27853, + "grad_norm": 0.8294643471519756, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 27853 + }, + { + "epoch": 0.27854, + "grad_norm": 0.8944895294955064, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 27854 + }, + { + "epoch": 0.27855, + "grad_norm": 0.8404797169195778, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 27855 + }, + { + "epoch": 0.27856, + "grad_norm": 0.8344276714343934, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 27856 + }, + { + "epoch": 0.27857, + "grad_norm": 0.7855922640785303, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 27857 + }, + { + "epoch": 0.27858, + "grad_norm": 0.7527340349076008, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 27858 + }, + { + "epoch": 0.27859, + "grad_norm": 0.7578495181672694, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 27859 + }, + { + "epoch": 0.2786, + "grad_norm": 0.855587951705903, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 27860 + }, + { + "epoch": 0.27861, + "grad_norm": 0.9730719473873491, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 27861 + }, + { + "epoch": 0.27862, + "grad_norm": 1.0024568645874534, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 27862 + }, + { + "epoch": 0.27863, + "grad_norm": 0.9178998012763384, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 27863 + }, + { + "epoch": 0.27864, + "grad_norm": 0.9591931274023741, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 27864 + }, + { + "epoch": 0.27865, + "grad_norm": 0.9568539522302781, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 27865 + }, + { + "epoch": 0.27866, + "grad_norm": 1.104995960455937, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 27866 + }, + { + "epoch": 0.27867, + "grad_norm": 1.0054840841322914, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 27867 + }, + { + "epoch": 0.27868, + "grad_norm": 0.9740717495697179, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 27868 + }, + { + "epoch": 0.27869, + "grad_norm": 0.9480523442023439, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 27869 + }, + { + "epoch": 0.2787, + "grad_norm": 0.8669983741893688, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 27870 + }, + { + "epoch": 0.27871, + "grad_norm": 0.808746021250193, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 27871 + }, + { + "epoch": 0.27872, + "grad_norm": 0.7799212114908738, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 27872 + }, + { + "epoch": 0.27873, + "grad_norm": 0.7857771251657594, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 27873 + }, + { + "epoch": 0.27874, + "grad_norm": 0.8317399428501995, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 27874 + }, + { + "epoch": 0.27875, + "grad_norm": 0.7816429938558018, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 27875 + }, + { + "epoch": 0.27876, + "grad_norm": 0.8848283887244174, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 27876 + }, + { + "epoch": 0.27877, + "grad_norm": 0.8974894467307845, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 27877 + }, + { + "epoch": 0.27878, + "grad_norm": 0.8344496459443077, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 27878 + }, + { + "epoch": 0.27879, + "grad_norm": 0.759009629843913, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 27879 + }, + { + "epoch": 0.2788, + "grad_norm": 0.7771478714984165, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 27880 + }, + { + "epoch": 0.27881, + "grad_norm": 0.8444094214788893, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 27881 + }, + { + "epoch": 0.27882, + "grad_norm": 1.0350118746058892, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 27882 + }, + { + "epoch": 0.27883, + "grad_norm": 1.167097807004749, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 27883 + }, + { + "epoch": 0.27884, + "grad_norm": 0.8220895981337908, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 27884 + }, + { + "epoch": 0.27885, + "grad_norm": 0.7292186866562702, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 27885 + }, + { + "epoch": 0.27886, + "grad_norm": 0.7072295467894064, + "learning_rate": 0.003, + "loss": 4.0027, + "step": 27886 + }, + { + "epoch": 0.27887, + "grad_norm": 0.703643453938188, + "learning_rate": 0.003, + "loss": 4.048, + "step": 27887 + }, + { + "epoch": 0.27888, + "grad_norm": 0.7494843456747895, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 27888 + }, + { + "epoch": 0.27889, + "grad_norm": 0.714232663075034, + "learning_rate": 0.003, + "loss": 4.025, + "step": 27889 + }, + { + "epoch": 0.2789, + "grad_norm": 0.6810545861968759, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 27890 + }, + { + "epoch": 0.27891, + "grad_norm": 0.8046949058406158, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 27891 + }, + { + "epoch": 0.27892, + "grad_norm": 1.0537351057565565, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 27892 + }, + { + "epoch": 0.27893, + "grad_norm": 1.1697799065270287, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 27893 + }, + { + "epoch": 0.27894, + "grad_norm": 0.8597762833522873, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 27894 + }, + { + "epoch": 0.27895, + "grad_norm": 0.7836211790781372, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 27895 + }, + { + "epoch": 0.27896, + "grad_norm": 0.7386413972957119, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 27896 + }, + { + "epoch": 0.27897, + "grad_norm": 0.7559091363977415, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 27897 + }, + { + "epoch": 0.27898, + "grad_norm": 0.8267679791884671, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 27898 + }, + { + "epoch": 0.27899, + "grad_norm": 0.8000158462339704, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 27899 + }, + { + "epoch": 0.279, + "grad_norm": 0.7497717421124886, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 27900 + }, + { + "epoch": 0.27901, + "grad_norm": 0.7147371895301244, + "learning_rate": 0.003, + "loss": 3.9764, + "step": 27901 + }, + { + "epoch": 0.27902, + "grad_norm": 0.7504260965129792, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 27902 + }, + { + "epoch": 0.27903, + "grad_norm": 0.9012233211384367, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 27903 + }, + { + "epoch": 0.27904, + "grad_norm": 1.1271750964681806, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 27904 + }, + { + "epoch": 0.27905, + "grad_norm": 0.8573688268605598, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 27905 + }, + { + "epoch": 0.27906, + "grad_norm": 0.7403019672021436, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 27906 + }, + { + "epoch": 0.27907, + "grad_norm": 0.7838417738312761, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 27907 + }, + { + "epoch": 0.27908, + "grad_norm": 0.8114614317390165, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 27908 + }, + { + "epoch": 0.27909, + "grad_norm": 0.8669116122053429, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 27909 + }, + { + "epoch": 0.2791, + "grad_norm": 0.8283561494730468, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 27910 + }, + { + "epoch": 0.27911, + "grad_norm": 0.8966530565073961, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 27911 + }, + { + "epoch": 0.27912, + "grad_norm": 0.9516518438369279, + "learning_rate": 0.003, + "loss": 4.0134, + "step": 27912 + }, + { + "epoch": 0.27913, + "grad_norm": 0.8684519950690195, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 27913 + }, + { + "epoch": 0.27914, + "grad_norm": 0.986947597296281, + "learning_rate": 0.003, + "loss": 4.065, + "step": 27914 + }, + { + "epoch": 0.27915, + "grad_norm": 1.1799559237577764, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 27915 + }, + { + "epoch": 0.27916, + "grad_norm": 0.7960642730075496, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 27916 + }, + { + "epoch": 0.27917, + "grad_norm": 0.8080462094755548, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 27917 + }, + { + "epoch": 0.27918, + "grad_norm": 0.918123729199587, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 27918 + }, + { + "epoch": 0.27919, + "grad_norm": 1.059007929451071, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 27919 + }, + { + "epoch": 0.2792, + "grad_norm": 0.9750033054095458, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 27920 + }, + { + "epoch": 0.27921, + "grad_norm": 0.9911738349754905, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 27921 + }, + { + "epoch": 0.27922, + "grad_norm": 1.1122616060291601, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 27922 + }, + { + "epoch": 0.27923, + "grad_norm": 0.8941226664357055, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 27923 + }, + { + "epoch": 0.27924, + "grad_norm": 0.8650060418915264, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 27924 + }, + { + "epoch": 0.27925, + "grad_norm": 0.8669245934363925, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 27925 + }, + { + "epoch": 0.27926, + "grad_norm": 0.8484777056862682, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 27926 + }, + { + "epoch": 0.27927, + "grad_norm": 0.858498560942655, + "learning_rate": 0.003, + "loss": 4.054, + "step": 27927 + }, + { + "epoch": 0.27928, + "grad_norm": 0.9410009282145507, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 27928 + }, + { + "epoch": 0.27929, + "grad_norm": 1.2881395282030546, + "learning_rate": 0.003, + "loss": 4.072, + "step": 27929 + }, + { + "epoch": 0.2793, + "grad_norm": 0.9035777330005687, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 27930 + }, + { + "epoch": 0.27931, + "grad_norm": 0.9297881233274283, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 27931 + }, + { + "epoch": 0.27932, + "grad_norm": 0.9251317050320439, + "learning_rate": 0.003, + "loss": 4.072, + "step": 27932 + }, + { + "epoch": 0.27933, + "grad_norm": 1.0032493100785973, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 27933 + }, + { + "epoch": 0.27934, + "grad_norm": 1.0280865586937362, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 27934 + }, + { + "epoch": 0.27935, + "grad_norm": 0.9472871665004493, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 27935 + }, + { + "epoch": 0.27936, + "grad_norm": 0.9793774868430313, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 27936 + }, + { + "epoch": 0.27937, + "grad_norm": 0.8269134075614692, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 27937 + }, + { + "epoch": 0.27938, + "grad_norm": 0.6623674181487251, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 27938 + }, + { + "epoch": 0.27939, + "grad_norm": 0.5946702126676037, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 27939 + }, + { + "epoch": 0.2794, + "grad_norm": 0.5745201811728066, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 27940 + }, + { + "epoch": 0.27941, + "grad_norm": 0.5442978655565623, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 27941 + }, + { + "epoch": 0.27942, + "grad_norm": 0.5993411184433973, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 27942 + }, + { + "epoch": 0.27943, + "grad_norm": 0.5966575564921346, + "learning_rate": 0.003, + "loss": 4.031, + "step": 27943 + }, + { + "epoch": 0.27944, + "grad_norm": 0.6855295628045422, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 27944 + }, + { + "epoch": 0.27945, + "grad_norm": 0.8085027755667794, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 27945 + }, + { + "epoch": 0.27946, + "grad_norm": 0.9629798848073549, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 27946 + }, + { + "epoch": 0.27947, + "grad_norm": 1.2446030032556772, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 27947 + }, + { + "epoch": 0.27948, + "grad_norm": 0.745471033704625, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 27948 + }, + { + "epoch": 0.27949, + "grad_norm": 0.8154684729488978, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 27949 + }, + { + "epoch": 0.2795, + "grad_norm": 0.8525491224328593, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 27950 + }, + { + "epoch": 0.27951, + "grad_norm": 0.8603327279350443, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 27951 + }, + { + "epoch": 0.27952, + "grad_norm": 0.9088975966312394, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 27952 + }, + { + "epoch": 0.27953, + "grad_norm": 0.8036664897271155, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 27953 + }, + { + "epoch": 0.27954, + "grad_norm": 0.8411073925673561, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 27954 + }, + { + "epoch": 0.27955, + "grad_norm": 0.7654120928093795, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 27955 + }, + { + "epoch": 0.27956, + "grad_norm": 0.7851092890440728, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 27956 + }, + { + "epoch": 0.27957, + "grad_norm": 0.9332914179269372, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 27957 + }, + { + "epoch": 0.27958, + "grad_norm": 1.2131963946057815, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 27958 + }, + { + "epoch": 0.27959, + "grad_norm": 0.8694858836383407, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 27959 + }, + { + "epoch": 0.2796, + "grad_norm": 0.7841455344323105, + "learning_rate": 0.003, + "loss": 4.04, + "step": 27960 + }, + { + "epoch": 0.27961, + "grad_norm": 0.7156746192352654, + "learning_rate": 0.003, + "loss": 4.057, + "step": 27961 + }, + { + "epoch": 0.27962, + "grad_norm": 0.7433449407276909, + "learning_rate": 0.003, + "loss": 4.064, + "step": 27962 + }, + { + "epoch": 0.27963, + "grad_norm": 0.8381300417895519, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 27963 + }, + { + "epoch": 0.27964, + "grad_norm": 0.9588249602415594, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 27964 + }, + { + "epoch": 0.27965, + "grad_norm": 0.9877942687618677, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 27965 + }, + { + "epoch": 0.27966, + "grad_norm": 0.8475834498159907, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 27966 + }, + { + "epoch": 0.27967, + "grad_norm": 0.8142775563938072, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 27967 + }, + { + "epoch": 0.27968, + "grad_norm": 0.6921921034946628, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 27968 + }, + { + "epoch": 0.27969, + "grad_norm": 0.5841439697314327, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 27969 + }, + { + "epoch": 0.2797, + "grad_norm": 0.5580503829444775, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 27970 + }, + { + "epoch": 0.27971, + "grad_norm": 0.6555573268844771, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 27971 + }, + { + "epoch": 0.27972, + "grad_norm": 0.7948088913928125, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 27972 + }, + { + "epoch": 0.27973, + "grad_norm": 0.9420387324940408, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 27973 + }, + { + "epoch": 0.27974, + "grad_norm": 1.1721628055930555, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 27974 + }, + { + "epoch": 0.27975, + "grad_norm": 0.8795338698251182, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 27975 + }, + { + "epoch": 0.27976, + "grad_norm": 0.8792674692208562, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 27976 + }, + { + "epoch": 0.27977, + "grad_norm": 0.9975051050597962, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 27977 + }, + { + "epoch": 0.27978, + "grad_norm": 0.9688880546064084, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 27978 + }, + { + "epoch": 0.27979, + "grad_norm": 1.0026211651197914, + "learning_rate": 0.003, + "loss": 4.049, + "step": 27979 + }, + { + "epoch": 0.2798, + "grad_norm": 0.9932254331445153, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 27980 + }, + { + "epoch": 0.27981, + "grad_norm": 1.0276352047310178, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 27981 + }, + { + "epoch": 0.27982, + "grad_norm": 0.9487216749557594, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 27982 + }, + { + "epoch": 0.27983, + "grad_norm": 0.8655200692085009, + "learning_rate": 0.003, + "loss": 4.03, + "step": 27983 + }, + { + "epoch": 0.27984, + "grad_norm": 0.8140856034965781, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 27984 + }, + { + "epoch": 0.27985, + "grad_norm": 0.8279083627366199, + "learning_rate": 0.003, + "loss": 4.028, + "step": 27985 + }, + { + "epoch": 0.27986, + "grad_norm": 1.0088555468106388, + "learning_rate": 0.003, + "loss": 4.07, + "step": 27986 + }, + { + "epoch": 0.27987, + "grad_norm": 1.318976441954303, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 27987 + }, + { + "epoch": 0.27988, + "grad_norm": 0.804546594664036, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 27988 + }, + { + "epoch": 0.27989, + "grad_norm": 0.7651213859258367, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 27989 + }, + { + "epoch": 0.2799, + "grad_norm": 0.8939963391897597, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 27990 + }, + { + "epoch": 0.27991, + "grad_norm": 1.1177666878733, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 27991 + }, + { + "epoch": 0.27992, + "grad_norm": 0.963233675789223, + "learning_rate": 0.003, + "loss": 4.053, + "step": 27992 + }, + { + "epoch": 0.27993, + "grad_norm": 0.7847430097747101, + "learning_rate": 0.003, + "loss": 4.043, + "step": 27993 + }, + { + "epoch": 0.27994, + "grad_norm": 0.6463821898267385, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 27994 + }, + { + "epoch": 0.27995, + "grad_norm": 0.5747246805024196, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 27995 + }, + { + "epoch": 0.27996, + "grad_norm": 0.630157050553995, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 27996 + }, + { + "epoch": 0.27997, + "grad_norm": 0.6993048894990165, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 27997 + }, + { + "epoch": 0.27998, + "grad_norm": 0.7225750856606026, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 27998 + }, + { + "epoch": 0.27999, + "grad_norm": 0.6802690340141045, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 27999 + }, + { + "epoch": 0.28, + "grad_norm": 0.6978310605733629, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 28000 + }, + { + "epoch": 0.28001, + "grad_norm": 0.792401153514491, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 28001 + }, + { + "epoch": 0.28002, + "grad_norm": 0.8818046149496479, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 28002 + }, + { + "epoch": 0.28003, + "grad_norm": 0.9941890534874771, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 28003 + }, + { + "epoch": 0.28004, + "grad_norm": 1.1394295606029359, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 28004 + }, + { + "epoch": 0.28005, + "grad_norm": 1.0372082725485239, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 28005 + }, + { + "epoch": 0.28006, + "grad_norm": 1.0880496023172948, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 28006 + }, + { + "epoch": 0.28007, + "grad_norm": 0.9906354507510764, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 28007 + }, + { + "epoch": 0.28008, + "grad_norm": 1.058160062012444, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 28008 + }, + { + "epoch": 0.28009, + "grad_norm": 0.8322046229994103, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 28009 + }, + { + "epoch": 0.2801, + "grad_norm": 0.7462792892435819, + "learning_rate": 0.003, + "loss": 4.055, + "step": 28010 + }, + { + "epoch": 0.28011, + "grad_norm": 0.6365064239709616, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 28011 + }, + { + "epoch": 0.28012, + "grad_norm": 0.6200562379092251, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 28012 + }, + { + "epoch": 0.28013, + "grad_norm": 0.641220912352142, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 28013 + }, + { + "epoch": 0.28014, + "grad_norm": 0.6464114049469843, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 28014 + }, + { + "epoch": 0.28015, + "grad_norm": 0.7425767896574786, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 28015 + }, + { + "epoch": 0.28016, + "grad_norm": 1.024305623376408, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 28016 + }, + { + "epoch": 0.28017, + "grad_norm": 1.254887444811122, + "learning_rate": 0.003, + "loss": 4.08, + "step": 28017 + }, + { + "epoch": 0.28018, + "grad_norm": 0.792178635742736, + "learning_rate": 0.003, + "loss": 4.052, + "step": 28018 + }, + { + "epoch": 0.28019, + "grad_norm": 0.6883560724226341, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 28019 + }, + { + "epoch": 0.2802, + "grad_norm": 0.6968968012898124, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 28020 + }, + { + "epoch": 0.28021, + "grad_norm": 0.6993532616850903, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 28021 + }, + { + "epoch": 0.28022, + "grad_norm": 0.7043072899634024, + "learning_rate": 0.003, + "loss": 4.052, + "step": 28022 + }, + { + "epoch": 0.28023, + "grad_norm": 0.8563000656902159, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 28023 + }, + { + "epoch": 0.28024, + "grad_norm": 1.147966632861741, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 28024 + }, + { + "epoch": 0.28025, + "grad_norm": 1.0788569084610193, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 28025 + }, + { + "epoch": 0.28026, + "grad_norm": 0.8740509850335905, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 28026 + }, + { + "epoch": 0.28027, + "grad_norm": 0.8379246708998198, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 28027 + }, + { + "epoch": 0.28028, + "grad_norm": 1.0286367206852993, + "learning_rate": 0.003, + "loss": 4.093, + "step": 28028 + }, + { + "epoch": 0.28029, + "grad_norm": 0.9255602027884176, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 28029 + }, + { + "epoch": 0.2803, + "grad_norm": 0.91161934001326, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 28030 + }, + { + "epoch": 0.28031, + "grad_norm": 0.8457986495035507, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 28031 + }, + { + "epoch": 0.28032, + "grad_norm": 0.83961713159867, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 28032 + }, + { + "epoch": 0.28033, + "grad_norm": 1.0918146882887996, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 28033 + }, + { + "epoch": 0.28034, + "grad_norm": 1.2498988903773436, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 28034 + }, + { + "epoch": 0.28035, + "grad_norm": 0.7353889376054008, + "learning_rate": 0.003, + "loss": 4.051, + "step": 28035 + }, + { + "epoch": 0.28036, + "grad_norm": 0.6854077548198501, + "learning_rate": 0.003, + "loss": 4.04, + "step": 28036 + }, + { + "epoch": 0.28037, + "grad_norm": 0.777977529614316, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 28037 + }, + { + "epoch": 0.28038, + "grad_norm": 0.8462242894586985, + "learning_rate": 0.003, + "loss": 4.04, + "step": 28038 + }, + { + "epoch": 0.28039, + "grad_norm": 0.8841290128983104, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 28039 + }, + { + "epoch": 0.2804, + "grad_norm": 0.9987232041674949, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 28040 + }, + { + "epoch": 0.28041, + "grad_norm": 1.1680975591460554, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 28041 + }, + { + "epoch": 0.28042, + "grad_norm": 0.9288047118801263, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 28042 + }, + { + "epoch": 0.28043, + "grad_norm": 0.8197164782621761, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 28043 + }, + { + "epoch": 0.28044, + "grad_norm": 0.6917657643318051, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 28044 + }, + { + "epoch": 0.28045, + "grad_norm": 0.7898174947412675, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 28045 + }, + { + "epoch": 0.28046, + "grad_norm": 0.8028080103708727, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 28046 + }, + { + "epoch": 0.28047, + "grad_norm": 0.897632487293533, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 28047 + }, + { + "epoch": 0.28048, + "grad_norm": 1.012344292707383, + "learning_rate": 0.003, + "loss": 4.045, + "step": 28048 + }, + { + "epoch": 0.28049, + "grad_norm": 1.0490924384944986, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 28049 + }, + { + "epoch": 0.2805, + "grad_norm": 0.9110788096522845, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 28050 + }, + { + "epoch": 0.28051, + "grad_norm": 0.7726600021556684, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 28051 + }, + { + "epoch": 0.28052, + "grad_norm": 0.6617614650477526, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 28052 + }, + { + "epoch": 0.28053, + "grad_norm": 0.7879979558224586, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 28053 + }, + { + "epoch": 0.28054, + "grad_norm": 0.876280217785457, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 28054 + }, + { + "epoch": 0.28055, + "grad_norm": 0.8835865973401837, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 28055 + }, + { + "epoch": 0.28056, + "grad_norm": 0.9411454316014107, + "learning_rate": 0.003, + "loss": 4.069, + "step": 28056 + }, + { + "epoch": 0.28057, + "grad_norm": 0.8976891833258241, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 28057 + }, + { + "epoch": 0.28058, + "grad_norm": 0.7408487822906077, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 28058 + }, + { + "epoch": 0.28059, + "grad_norm": 0.629877688024636, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 28059 + }, + { + "epoch": 0.2806, + "grad_norm": 0.5861682481843686, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 28060 + }, + { + "epoch": 0.28061, + "grad_norm": 0.5705007456171375, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 28061 + }, + { + "epoch": 0.28062, + "grad_norm": 0.5839134571552621, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 28062 + }, + { + "epoch": 0.28063, + "grad_norm": 0.6418554943272836, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 28063 + }, + { + "epoch": 0.28064, + "grad_norm": 0.7667902324827514, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 28064 + }, + { + "epoch": 0.28065, + "grad_norm": 0.8794799413618, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 28065 + }, + { + "epoch": 0.28066, + "grad_norm": 0.8919335689815453, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 28066 + }, + { + "epoch": 0.28067, + "grad_norm": 0.973464464225869, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 28067 + }, + { + "epoch": 0.28068, + "grad_norm": 1.066227042271281, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 28068 + }, + { + "epoch": 0.28069, + "grad_norm": 0.884096687204083, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 28069 + }, + { + "epoch": 0.2807, + "grad_norm": 1.1076768044921277, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 28070 + }, + { + "epoch": 0.28071, + "grad_norm": 1.0747297559220055, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 28071 + }, + { + "epoch": 0.28072, + "grad_norm": 0.9271490189456614, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 28072 + }, + { + "epoch": 0.28073, + "grad_norm": 0.8986805297422971, + "learning_rate": 0.003, + "loss": 4.022, + "step": 28073 + }, + { + "epoch": 0.28074, + "grad_norm": 0.8471311387291113, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 28074 + }, + { + "epoch": 0.28075, + "grad_norm": 0.9196323075723072, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 28075 + }, + { + "epoch": 0.28076, + "grad_norm": 1.21950285579724, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 28076 + }, + { + "epoch": 0.28077, + "grad_norm": 1.0434121487062433, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 28077 + }, + { + "epoch": 0.28078, + "grad_norm": 0.9959024592713192, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 28078 + }, + { + "epoch": 0.28079, + "grad_norm": 1.0036524351507201, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 28079 + }, + { + "epoch": 0.2808, + "grad_norm": 0.9257760523030367, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 28080 + }, + { + "epoch": 0.28081, + "grad_norm": 0.911869163902302, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 28081 + }, + { + "epoch": 0.28082, + "grad_norm": 0.854474045736869, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 28082 + }, + { + "epoch": 0.28083, + "grad_norm": 0.7732663917305774, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 28083 + }, + { + "epoch": 0.28084, + "grad_norm": 0.7982154689464331, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 28084 + }, + { + "epoch": 0.28085, + "grad_norm": 0.7814169607075075, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 28085 + }, + { + "epoch": 0.28086, + "grad_norm": 0.7481340363832668, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 28086 + }, + { + "epoch": 0.28087, + "grad_norm": 0.742342668252692, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 28087 + }, + { + "epoch": 0.28088, + "grad_norm": 0.632289889319823, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 28088 + }, + { + "epoch": 0.28089, + "grad_norm": 0.6420516119624792, + "learning_rate": 0.003, + "loss": 4.049, + "step": 28089 + }, + { + "epoch": 0.2809, + "grad_norm": 0.6065177225097301, + "learning_rate": 0.003, + "loss": 4.02, + "step": 28090 + }, + { + "epoch": 0.28091, + "grad_norm": 0.709820383133188, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 28091 + }, + { + "epoch": 0.28092, + "grad_norm": 0.8682239534479389, + "learning_rate": 0.003, + "loss": 4.081, + "step": 28092 + }, + { + "epoch": 0.28093, + "grad_norm": 1.1145722448807889, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 28093 + }, + { + "epoch": 0.28094, + "grad_norm": 1.0995172327216474, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 28094 + }, + { + "epoch": 0.28095, + "grad_norm": 0.7410414743703411, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 28095 + }, + { + "epoch": 0.28096, + "grad_norm": 0.6726409198350183, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 28096 + }, + { + "epoch": 0.28097, + "grad_norm": 0.7206661348289416, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 28097 + }, + { + "epoch": 0.28098, + "grad_norm": 0.7344826454859487, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 28098 + }, + { + "epoch": 0.28099, + "grad_norm": 0.8029258377630194, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 28099 + }, + { + "epoch": 0.281, + "grad_norm": 0.7625212298518022, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 28100 + }, + { + "epoch": 0.28101, + "grad_norm": 0.7694156933165289, + "learning_rate": 0.003, + "loss": 3.999, + "step": 28101 + }, + { + "epoch": 0.28102, + "grad_norm": 0.8013740263975098, + "learning_rate": 0.003, + "loss": 4.001, + "step": 28102 + }, + { + "epoch": 0.28103, + "grad_norm": 0.7970503553980725, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 28103 + }, + { + "epoch": 0.28104, + "grad_norm": 0.8255993888748747, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 28104 + }, + { + "epoch": 0.28105, + "grad_norm": 0.7573721900358236, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 28105 + }, + { + "epoch": 0.28106, + "grad_norm": 0.6927581237302101, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 28106 + }, + { + "epoch": 0.28107, + "grad_norm": 0.8280779631509771, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 28107 + }, + { + "epoch": 0.28108, + "grad_norm": 0.9654121443579612, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 28108 + }, + { + "epoch": 0.28109, + "grad_norm": 0.9491568637569016, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 28109 + }, + { + "epoch": 0.2811, + "grad_norm": 0.9660593689368192, + "learning_rate": 0.003, + "loss": 4.035, + "step": 28110 + }, + { + "epoch": 0.28111, + "grad_norm": 1.1707432623966816, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 28111 + }, + { + "epoch": 0.28112, + "grad_norm": 0.9824792601050164, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 28112 + }, + { + "epoch": 0.28113, + "grad_norm": 1.1128808368651908, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 28113 + }, + { + "epoch": 0.28114, + "grad_norm": 0.9436636896398909, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 28114 + }, + { + "epoch": 0.28115, + "grad_norm": 0.9298411791672035, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 28115 + }, + { + "epoch": 0.28116, + "grad_norm": 0.8693170146225, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 28116 + }, + { + "epoch": 0.28117, + "grad_norm": 0.9782251863803261, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 28117 + }, + { + "epoch": 0.28118, + "grad_norm": 0.9319861230604518, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 28118 + }, + { + "epoch": 0.28119, + "grad_norm": 0.9771027245549708, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 28119 + }, + { + "epoch": 0.2812, + "grad_norm": 0.9522616902976506, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 28120 + }, + { + "epoch": 0.28121, + "grad_norm": 0.8871586590618997, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 28121 + }, + { + "epoch": 0.28122, + "grad_norm": 0.88123372453046, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 28122 + }, + { + "epoch": 0.28123, + "grad_norm": 1.0241777694491345, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 28123 + }, + { + "epoch": 0.28124, + "grad_norm": 1.0844433168667353, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 28124 + }, + { + "epoch": 0.28125, + "grad_norm": 1.067333950774928, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 28125 + }, + { + "epoch": 0.28126, + "grad_norm": 0.8204318245407727, + "learning_rate": 0.003, + "loss": 4.043, + "step": 28126 + }, + { + "epoch": 0.28127, + "grad_norm": 0.7195118346432481, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 28127 + }, + { + "epoch": 0.28128, + "grad_norm": 0.8820027913409202, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 28128 + }, + { + "epoch": 0.28129, + "grad_norm": 0.9604981629570484, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 28129 + }, + { + "epoch": 0.2813, + "grad_norm": 1.0523989474778024, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 28130 + }, + { + "epoch": 0.28131, + "grad_norm": 0.9194506133545249, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 28131 + }, + { + "epoch": 0.28132, + "grad_norm": 0.8024699801436255, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 28132 + }, + { + "epoch": 0.28133, + "grad_norm": 0.7123743620998565, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 28133 + }, + { + "epoch": 0.28134, + "grad_norm": 0.6403913547271601, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 28134 + }, + { + "epoch": 0.28135, + "grad_norm": 0.6568282637167792, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 28135 + }, + { + "epoch": 0.28136, + "grad_norm": 0.6848772464866574, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 28136 + }, + { + "epoch": 0.28137, + "grad_norm": 0.7033363618862843, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 28137 + }, + { + "epoch": 0.28138, + "grad_norm": 0.6544817485644114, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 28138 + }, + { + "epoch": 0.28139, + "grad_norm": 0.7278087113513992, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 28139 + }, + { + "epoch": 0.2814, + "grad_norm": 0.8315153272810895, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 28140 + }, + { + "epoch": 0.28141, + "grad_norm": 0.9350560687958722, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 28141 + }, + { + "epoch": 0.28142, + "grad_norm": 0.9738023901716829, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 28142 + }, + { + "epoch": 0.28143, + "grad_norm": 0.9664464244538686, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 28143 + }, + { + "epoch": 0.28144, + "grad_norm": 0.9198284352573544, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 28144 + }, + { + "epoch": 0.28145, + "grad_norm": 0.7828724661638736, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 28145 + }, + { + "epoch": 0.28146, + "grad_norm": 0.8358004096559631, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 28146 + }, + { + "epoch": 0.28147, + "grad_norm": 0.9843589798857394, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 28147 + }, + { + "epoch": 0.28148, + "grad_norm": 1.0062359503984668, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 28148 + }, + { + "epoch": 0.28149, + "grad_norm": 1.0323069374152105, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 28149 + }, + { + "epoch": 0.2815, + "grad_norm": 0.9347358269610015, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 28150 + }, + { + "epoch": 0.28151, + "grad_norm": 1.085372950065363, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 28151 + }, + { + "epoch": 0.28152, + "grad_norm": 0.9397812774363618, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 28152 + }, + { + "epoch": 0.28153, + "grad_norm": 0.8734882219180954, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 28153 + }, + { + "epoch": 0.28154, + "grad_norm": 0.7814307375143057, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 28154 + }, + { + "epoch": 0.28155, + "grad_norm": 0.6952285444470059, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 28155 + }, + { + "epoch": 0.28156, + "grad_norm": 0.8397622767134504, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 28156 + }, + { + "epoch": 0.28157, + "grad_norm": 0.8209586092541783, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 28157 + }, + { + "epoch": 0.28158, + "grad_norm": 0.7181336381325689, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 28158 + }, + { + "epoch": 0.28159, + "grad_norm": 0.8604883261568401, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 28159 + }, + { + "epoch": 0.2816, + "grad_norm": 0.964771171487784, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 28160 + }, + { + "epoch": 0.28161, + "grad_norm": 0.869864790671891, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 28161 + }, + { + "epoch": 0.28162, + "grad_norm": 0.8426479989666904, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 28162 + }, + { + "epoch": 0.28163, + "grad_norm": 0.8869618603463989, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 28163 + }, + { + "epoch": 0.28164, + "grad_norm": 0.9151362014883557, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 28164 + }, + { + "epoch": 0.28165, + "grad_norm": 0.9156498708521942, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 28165 + }, + { + "epoch": 0.28166, + "grad_norm": 0.864796014710332, + "learning_rate": 0.003, + "loss": 4.07, + "step": 28166 + }, + { + "epoch": 0.28167, + "grad_norm": 0.815346099767291, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 28167 + }, + { + "epoch": 0.28168, + "grad_norm": 0.8111521259932593, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 28168 + }, + { + "epoch": 0.28169, + "grad_norm": 0.7791251075544184, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 28169 + }, + { + "epoch": 0.2817, + "grad_norm": 0.7531376211438611, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 28170 + }, + { + "epoch": 0.28171, + "grad_norm": 0.7158477732427472, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 28171 + }, + { + "epoch": 0.28172, + "grad_norm": 0.8639409042569773, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 28172 + }, + { + "epoch": 0.28173, + "grad_norm": 1.0584963011450212, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 28173 + }, + { + "epoch": 0.28174, + "grad_norm": 1.1056171315534873, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 28174 + }, + { + "epoch": 0.28175, + "grad_norm": 0.7954393135427869, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 28175 + }, + { + "epoch": 0.28176, + "grad_norm": 0.8117298131021036, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 28176 + }, + { + "epoch": 0.28177, + "grad_norm": 0.8639984696898506, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 28177 + }, + { + "epoch": 0.28178, + "grad_norm": 1.0574065925156941, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 28178 + }, + { + "epoch": 0.28179, + "grad_norm": 1.107785625765744, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 28179 + }, + { + "epoch": 0.2818, + "grad_norm": 0.8713944371064622, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 28180 + }, + { + "epoch": 0.28181, + "grad_norm": 0.8465039011656147, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 28181 + }, + { + "epoch": 0.28182, + "grad_norm": 0.8505639817588961, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 28182 + }, + { + "epoch": 0.28183, + "grad_norm": 0.7935915049053812, + "learning_rate": 0.003, + "loss": 4.021, + "step": 28183 + }, + { + "epoch": 0.28184, + "grad_norm": 0.8767463555296504, + "learning_rate": 0.003, + "loss": 3.9896, + "step": 28184 + }, + { + "epoch": 0.28185, + "grad_norm": 1.0795249011036232, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 28185 + }, + { + "epoch": 0.28186, + "grad_norm": 0.9078598609073928, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 28186 + }, + { + "epoch": 0.28187, + "grad_norm": 0.8464772800526946, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 28187 + }, + { + "epoch": 0.28188, + "grad_norm": 1.05085158654088, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 28188 + }, + { + "epoch": 0.28189, + "grad_norm": 0.9509395939647651, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 28189 + }, + { + "epoch": 0.2819, + "grad_norm": 1.1626826335663998, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 28190 + }, + { + "epoch": 0.28191, + "grad_norm": 0.8639524891972427, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 28191 + }, + { + "epoch": 0.28192, + "grad_norm": 0.8413745019266091, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 28192 + }, + { + "epoch": 0.28193, + "grad_norm": 0.9762164480373203, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 28193 + }, + { + "epoch": 0.28194, + "grad_norm": 1.1505147751752778, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 28194 + }, + { + "epoch": 0.28195, + "grad_norm": 0.8714257123710107, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 28195 + }, + { + "epoch": 0.28196, + "grad_norm": 0.8190312405652552, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 28196 + }, + { + "epoch": 0.28197, + "grad_norm": 0.7083915120832939, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 28197 + }, + { + "epoch": 0.28198, + "grad_norm": 0.7118363356784513, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 28198 + }, + { + "epoch": 0.28199, + "grad_norm": 0.7576966264696798, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 28199 + }, + { + "epoch": 0.282, + "grad_norm": 0.7013751518112019, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 28200 + }, + { + "epoch": 0.28201, + "grad_norm": 0.6924670475129973, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 28201 + }, + { + "epoch": 0.28202, + "grad_norm": 0.6701044164200314, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 28202 + }, + { + "epoch": 0.28203, + "grad_norm": 0.7147755071820038, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 28203 + }, + { + "epoch": 0.28204, + "grad_norm": 0.8550476656120515, + "learning_rate": 0.003, + "loss": 4.057, + "step": 28204 + }, + { + "epoch": 0.28205, + "grad_norm": 0.7821505551737349, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 28205 + }, + { + "epoch": 0.28206, + "grad_norm": 0.8145687099711261, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 28206 + }, + { + "epoch": 0.28207, + "grad_norm": 0.9971051065289025, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 28207 + }, + { + "epoch": 0.28208, + "grad_norm": 1.3170040492329025, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 28208 + }, + { + "epoch": 0.28209, + "grad_norm": 0.6929366740383698, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 28209 + }, + { + "epoch": 0.2821, + "grad_norm": 0.6195448235850514, + "learning_rate": 0.003, + "loss": 3.9976, + "step": 28210 + }, + { + "epoch": 0.28211, + "grad_norm": 0.6703323793680732, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 28211 + }, + { + "epoch": 0.28212, + "grad_norm": 0.7078195913303464, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 28212 + }, + { + "epoch": 0.28213, + "grad_norm": 0.7259258607462531, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 28213 + }, + { + "epoch": 0.28214, + "grad_norm": 0.7306179723273285, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 28214 + }, + { + "epoch": 0.28215, + "grad_norm": 0.8062555283029527, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 28215 + }, + { + "epoch": 0.28216, + "grad_norm": 0.8679362047946177, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 28216 + }, + { + "epoch": 0.28217, + "grad_norm": 0.9991109212290636, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 28217 + }, + { + "epoch": 0.28218, + "grad_norm": 1.103876658031368, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 28218 + }, + { + "epoch": 0.28219, + "grad_norm": 0.9490654348234014, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 28219 + }, + { + "epoch": 0.2822, + "grad_norm": 1.0531165941479594, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 28220 + }, + { + "epoch": 0.28221, + "grad_norm": 0.9663026850558428, + "learning_rate": 0.003, + "loss": 4.056, + "step": 28221 + }, + { + "epoch": 0.28222, + "grad_norm": 0.8305585626675177, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 28222 + }, + { + "epoch": 0.28223, + "grad_norm": 0.7256774548013621, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 28223 + }, + { + "epoch": 0.28224, + "grad_norm": 0.7157274554843289, + "learning_rate": 0.003, + "loss": 4.058, + "step": 28224 + }, + { + "epoch": 0.28225, + "grad_norm": 0.7600991419009422, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 28225 + }, + { + "epoch": 0.28226, + "grad_norm": 0.5672567639339383, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 28226 + }, + { + "epoch": 0.28227, + "grad_norm": 0.6199196468163255, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 28227 + }, + { + "epoch": 0.28228, + "grad_norm": 0.7574550764533049, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 28228 + }, + { + "epoch": 0.28229, + "grad_norm": 0.8869521442159312, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 28229 + }, + { + "epoch": 0.2823, + "grad_norm": 1.1931566139936698, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 28230 + }, + { + "epoch": 0.28231, + "grad_norm": 0.956220342852974, + "learning_rate": 0.003, + "loss": 4.033, + "step": 28231 + }, + { + "epoch": 0.28232, + "grad_norm": 1.0249511084528655, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 28232 + }, + { + "epoch": 0.28233, + "grad_norm": 0.9722567028854423, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 28233 + }, + { + "epoch": 0.28234, + "grad_norm": 0.8704189171986808, + "learning_rate": 0.003, + "loss": 4.051, + "step": 28234 + }, + { + "epoch": 0.28235, + "grad_norm": 0.772673969426097, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 28235 + }, + { + "epoch": 0.28236, + "grad_norm": 0.7172285002598241, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 28236 + }, + { + "epoch": 0.28237, + "grad_norm": 0.7880859565992895, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 28237 + }, + { + "epoch": 0.28238, + "grad_norm": 0.9259399436633314, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 28238 + }, + { + "epoch": 0.28239, + "grad_norm": 0.8752579024245218, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 28239 + }, + { + "epoch": 0.2824, + "grad_norm": 0.8052160750235269, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 28240 + }, + { + "epoch": 0.28241, + "grad_norm": 0.7766720364276999, + "learning_rate": 0.003, + "loss": 4.053, + "step": 28241 + }, + { + "epoch": 0.28242, + "grad_norm": 0.7218784776751477, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 28242 + }, + { + "epoch": 0.28243, + "grad_norm": 0.8005574858540452, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 28243 + }, + { + "epoch": 0.28244, + "grad_norm": 0.8391641274513753, + "learning_rate": 0.003, + "loss": 4.034, + "step": 28244 + }, + { + "epoch": 0.28245, + "grad_norm": 0.9624776705109975, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 28245 + }, + { + "epoch": 0.28246, + "grad_norm": 1.1013495431357332, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 28246 + }, + { + "epoch": 0.28247, + "grad_norm": 0.9141613254983036, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 28247 + }, + { + "epoch": 0.28248, + "grad_norm": 0.9213554421512912, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 28248 + }, + { + "epoch": 0.28249, + "grad_norm": 0.8863299804477717, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 28249 + }, + { + "epoch": 0.2825, + "grad_norm": 0.9017098164315972, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 28250 + }, + { + "epoch": 0.28251, + "grad_norm": 0.965237227832434, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 28251 + }, + { + "epoch": 0.28252, + "grad_norm": 0.9150395331222423, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 28252 + }, + { + "epoch": 0.28253, + "grad_norm": 0.9923713089176346, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 28253 + }, + { + "epoch": 0.28254, + "grad_norm": 1.1396821128647427, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 28254 + }, + { + "epoch": 0.28255, + "grad_norm": 1.0727447613033139, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 28255 + }, + { + "epoch": 0.28256, + "grad_norm": 1.1050199187088274, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 28256 + }, + { + "epoch": 0.28257, + "grad_norm": 0.954500085955581, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 28257 + }, + { + "epoch": 0.28258, + "grad_norm": 0.9814661462062373, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 28258 + }, + { + "epoch": 0.28259, + "grad_norm": 0.8586861676574926, + "learning_rate": 0.003, + "loss": 4.034, + "step": 28259 + }, + { + "epoch": 0.2826, + "grad_norm": 0.8021755481434575, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 28260 + }, + { + "epoch": 0.28261, + "grad_norm": 0.860195487074539, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 28261 + }, + { + "epoch": 0.28262, + "grad_norm": 0.8437195768085451, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 28262 + }, + { + "epoch": 0.28263, + "grad_norm": 0.8412309030797578, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 28263 + }, + { + "epoch": 0.28264, + "grad_norm": 0.8431538632819824, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 28264 + }, + { + "epoch": 0.28265, + "grad_norm": 0.7886352619561732, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 28265 + }, + { + "epoch": 0.28266, + "grad_norm": 0.8273477841989267, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 28266 + }, + { + "epoch": 0.28267, + "grad_norm": 0.8662850287550045, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 28267 + }, + { + "epoch": 0.28268, + "grad_norm": 0.9522480760006031, + "learning_rate": 0.003, + "loss": 4.06, + "step": 28268 + }, + { + "epoch": 0.28269, + "grad_norm": 1.0905538963420323, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 28269 + }, + { + "epoch": 0.2827, + "grad_norm": 0.8752338203179475, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 28270 + }, + { + "epoch": 0.28271, + "grad_norm": 0.8117612669645988, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 28271 + }, + { + "epoch": 0.28272, + "grad_norm": 0.7811353198898225, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 28272 + }, + { + "epoch": 0.28273, + "grad_norm": 0.7178233659034161, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 28273 + }, + { + "epoch": 0.28274, + "grad_norm": 0.6523313412260994, + "learning_rate": 0.003, + "loss": 3.9982, + "step": 28274 + }, + { + "epoch": 0.28275, + "grad_norm": 0.6967378029178256, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 28275 + }, + { + "epoch": 0.28276, + "grad_norm": 0.8548617523019947, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 28276 + }, + { + "epoch": 0.28277, + "grad_norm": 1.141492298654007, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 28277 + }, + { + "epoch": 0.28278, + "grad_norm": 0.9205699478385692, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 28278 + }, + { + "epoch": 0.28279, + "grad_norm": 0.7241236202250745, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 28279 + }, + { + "epoch": 0.2828, + "grad_norm": 0.7427962382256454, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 28280 + }, + { + "epoch": 0.28281, + "grad_norm": 0.8167945233820126, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 28281 + }, + { + "epoch": 0.28282, + "grad_norm": 0.8409863288531024, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 28282 + }, + { + "epoch": 0.28283, + "grad_norm": 0.8137626030495895, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 28283 + }, + { + "epoch": 0.28284, + "grad_norm": 0.9548363661514379, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 28284 + }, + { + "epoch": 0.28285, + "grad_norm": 1.0559393581044432, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 28285 + }, + { + "epoch": 0.28286, + "grad_norm": 0.9710812085799814, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 28286 + }, + { + "epoch": 0.28287, + "grad_norm": 0.8126393619955529, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 28287 + }, + { + "epoch": 0.28288, + "grad_norm": 0.8400615669447016, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 28288 + }, + { + "epoch": 0.28289, + "grad_norm": 0.9667834512644995, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 28289 + }, + { + "epoch": 0.2829, + "grad_norm": 0.9538188587627349, + "learning_rate": 0.003, + "loss": 4.023, + "step": 28290 + }, + { + "epoch": 0.28291, + "grad_norm": 0.9390098114524018, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 28291 + }, + { + "epoch": 0.28292, + "grad_norm": 0.9602841737724106, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 28292 + }, + { + "epoch": 0.28293, + "grad_norm": 0.986154003979366, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 28293 + }, + { + "epoch": 0.28294, + "grad_norm": 0.8705095940201407, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 28294 + }, + { + "epoch": 0.28295, + "grad_norm": 0.8293374347524015, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 28295 + }, + { + "epoch": 0.28296, + "grad_norm": 0.8280917798988214, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 28296 + }, + { + "epoch": 0.28297, + "grad_norm": 0.8716438947973035, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 28297 + }, + { + "epoch": 0.28298, + "grad_norm": 0.9844496754166078, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 28298 + }, + { + "epoch": 0.28299, + "grad_norm": 1.1097938424179985, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 28299 + }, + { + "epoch": 0.283, + "grad_norm": 0.938824106603746, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 28300 + }, + { + "epoch": 0.28301, + "grad_norm": 1.0744371513352338, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 28301 + }, + { + "epoch": 0.28302, + "grad_norm": 0.9635558870327866, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 28302 + }, + { + "epoch": 0.28303, + "grad_norm": 0.7714292338849613, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 28303 + }, + { + "epoch": 0.28304, + "grad_norm": 0.6330090683663562, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 28304 + }, + { + "epoch": 0.28305, + "grad_norm": 0.6467654101024599, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 28305 + }, + { + "epoch": 0.28306, + "grad_norm": 0.6487490601895508, + "learning_rate": 0.003, + "loss": 4.031, + "step": 28306 + }, + { + "epoch": 0.28307, + "grad_norm": 0.7416176037728157, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 28307 + }, + { + "epoch": 0.28308, + "grad_norm": 0.7392811899959272, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 28308 + }, + { + "epoch": 0.28309, + "grad_norm": 0.6184831422020652, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 28309 + }, + { + "epoch": 0.2831, + "grad_norm": 0.5704995144190532, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 28310 + }, + { + "epoch": 0.28311, + "grad_norm": 0.565276742654718, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 28311 + }, + { + "epoch": 0.28312, + "grad_norm": 0.6092218961226599, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 28312 + }, + { + "epoch": 0.28313, + "grad_norm": 0.6239629745158332, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 28313 + }, + { + "epoch": 0.28314, + "grad_norm": 0.6013439743049495, + "learning_rate": 0.003, + "loss": 4.031, + "step": 28314 + }, + { + "epoch": 0.28315, + "grad_norm": 0.5901684515048417, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 28315 + }, + { + "epoch": 0.28316, + "grad_norm": 0.7407785552526108, + "learning_rate": 0.003, + "loss": 4.023, + "step": 28316 + }, + { + "epoch": 0.28317, + "grad_norm": 1.0835280701329584, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 28317 + }, + { + "epoch": 0.28318, + "grad_norm": 1.0547915463961788, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 28318 + }, + { + "epoch": 0.28319, + "grad_norm": 0.8879614999663559, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 28319 + }, + { + "epoch": 0.2832, + "grad_norm": 0.8152934153885669, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 28320 + }, + { + "epoch": 0.28321, + "grad_norm": 0.7455218700188075, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 28321 + }, + { + "epoch": 0.28322, + "grad_norm": 0.7533929298474943, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 28322 + }, + { + "epoch": 0.28323, + "grad_norm": 0.8199852623212297, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 28323 + }, + { + "epoch": 0.28324, + "grad_norm": 0.8535768897844491, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 28324 + }, + { + "epoch": 0.28325, + "grad_norm": 0.8461385332323054, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 28325 + }, + { + "epoch": 0.28326, + "grad_norm": 0.8737690888579969, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 28326 + }, + { + "epoch": 0.28327, + "grad_norm": 1.1527661745195132, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 28327 + }, + { + "epoch": 0.28328, + "grad_norm": 0.9720401268452428, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 28328 + }, + { + "epoch": 0.28329, + "grad_norm": 0.9123470801570708, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 28329 + }, + { + "epoch": 0.2833, + "grad_norm": 1.069056923336479, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 28330 + }, + { + "epoch": 0.28331, + "grad_norm": 1.111773329457238, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 28331 + }, + { + "epoch": 0.28332, + "grad_norm": 0.8694627906320761, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 28332 + }, + { + "epoch": 0.28333, + "grad_norm": 0.8590422866612256, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 28333 + }, + { + "epoch": 0.28334, + "grad_norm": 0.8624363527129874, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 28334 + }, + { + "epoch": 0.28335, + "grad_norm": 0.8164971170824165, + "learning_rate": 0.003, + "loss": 4.017, + "step": 28335 + }, + { + "epoch": 0.28336, + "grad_norm": 0.7178766988621266, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 28336 + }, + { + "epoch": 0.28337, + "grad_norm": 0.894691151745384, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 28337 + }, + { + "epoch": 0.28338, + "grad_norm": 1.143484201048068, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 28338 + }, + { + "epoch": 0.28339, + "grad_norm": 0.9543498556747936, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 28339 + }, + { + "epoch": 0.2834, + "grad_norm": 0.8910555068110552, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 28340 + }, + { + "epoch": 0.28341, + "grad_norm": 0.7990980382175268, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 28341 + }, + { + "epoch": 0.28342, + "grad_norm": 0.7797125301228177, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 28342 + }, + { + "epoch": 0.28343, + "grad_norm": 0.7237847390391007, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 28343 + }, + { + "epoch": 0.28344, + "grad_norm": 0.7458500534930077, + "learning_rate": 0.003, + "loss": 4.051, + "step": 28344 + }, + { + "epoch": 0.28345, + "grad_norm": 0.6574466977527601, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 28345 + }, + { + "epoch": 0.28346, + "grad_norm": 0.6425277044691546, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 28346 + }, + { + "epoch": 0.28347, + "grad_norm": 0.6567406570521138, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 28347 + }, + { + "epoch": 0.28348, + "grad_norm": 0.6397604084409447, + "learning_rate": 0.003, + "loss": 4.026, + "step": 28348 + }, + { + "epoch": 0.28349, + "grad_norm": 0.678991479186897, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 28349 + }, + { + "epoch": 0.2835, + "grad_norm": 0.6580179951546377, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 28350 + }, + { + "epoch": 0.28351, + "grad_norm": 0.6684480985719111, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 28351 + }, + { + "epoch": 0.28352, + "grad_norm": 0.7891825056351334, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 28352 + }, + { + "epoch": 0.28353, + "grad_norm": 1.0495828815160515, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 28353 + }, + { + "epoch": 0.28354, + "grad_norm": 1.3209846672063401, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 28354 + }, + { + "epoch": 0.28355, + "grad_norm": 0.7136545067828034, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 28355 + }, + { + "epoch": 0.28356, + "grad_norm": 0.7744366821603748, + "learning_rate": 0.003, + "loss": 3.996, + "step": 28356 + }, + { + "epoch": 0.28357, + "grad_norm": 0.8365887140756004, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 28357 + }, + { + "epoch": 0.28358, + "grad_norm": 0.9383890589522561, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 28358 + }, + { + "epoch": 0.28359, + "grad_norm": 1.0762483131697052, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 28359 + }, + { + "epoch": 0.2836, + "grad_norm": 0.902178388705875, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 28360 + }, + { + "epoch": 0.28361, + "grad_norm": 0.7989985039848234, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 28361 + }, + { + "epoch": 0.28362, + "grad_norm": 0.9083886401810168, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 28362 + }, + { + "epoch": 0.28363, + "grad_norm": 1.1221242671112592, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 28363 + }, + { + "epoch": 0.28364, + "grad_norm": 1.1566889445859418, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 28364 + }, + { + "epoch": 0.28365, + "grad_norm": 1.0571742698791344, + "learning_rate": 0.003, + "loss": 4.0047, + "step": 28365 + }, + { + "epoch": 0.28366, + "grad_norm": 0.8309173852494756, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 28366 + }, + { + "epoch": 0.28367, + "grad_norm": 0.7683362983834483, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 28367 + }, + { + "epoch": 0.28368, + "grad_norm": 0.9174978562329252, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 28368 + }, + { + "epoch": 0.28369, + "grad_norm": 0.9695409410455902, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 28369 + }, + { + "epoch": 0.2837, + "grad_norm": 0.9456042261756763, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 28370 + }, + { + "epoch": 0.28371, + "grad_norm": 0.9987139301760894, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 28371 + }, + { + "epoch": 0.28372, + "grad_norm": 1.0046369697478696, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 28372 + }, + { + "epoch": 0.28373, + "grad_norm": 1.0994078188059953, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 28373 + }, + { + "epoch": 0.28374, + "grad_norm": 0.9257704762740899, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 28374 + }, + { + "epoch": 0.28375, + "grad_norm": 1.0246734964479787, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 28375 + }, + { + "epoch": 0.28376, + "grad_norm": 1.2431859427491114, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 28376 + }, + { + "epoch": 0.28377, + "grad_norm": 0.9519009927485098, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 28377 + }, + { + "epoch": 0.28378, + "grad_norm": 1.018239886114814, + "learning_rate": 0.003, + "loss": 4.059, + "step": 28378 + }, + { + "epoch": 0.28379, + "grad_norm": 1.1245746836029091, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 28379 + }, + { + "epoch": 0.2838, + "grad_norm": 0.7257416121067121, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 28380 + }, + { + "epoch": 0.28381, + "grad_norm": 0.6320434353914891, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 28381 + }, + { + "epoch": 0.28382, + "grad_norm": 0.753020904458912, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 28382 + }, + { + "epoch": 0.28383, + "grad_norm": 0.7589551961477055, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 28383 + }, + { + "epoch": 0.28384, + "grad_norm": 0.7130353272980438, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 28384 + }, + { + "epoch": 0.28385, + "grad_norm": 0.8514072880594915, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 28385 + }, + { + "epoch": 0.28386, + "grad_norm": 0.9048152898838031, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 28386 + }, + { + "epoch": 0.28387, + "grad_norm": 0.8696735484799988, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 28387 + }, + { + "epoch": 0.28388, + "grad_norm": 0.7677338977373028, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 28388 + }, + { + "epoch": 0.28389, + "grad_norm": 0.7447454685996496, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 28389 + }, + { + "epoch": 0.2839, + "grad_norm": 0.7552100659274816, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 28390 + }, + { + "epoch": 0.28391, + "grad_norm": 0.6248123484740445, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 28391 + }, + { + "epoch": 0.28392, + "grad_norm": 0.6776257314014614, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 28392 + }, + { + "epoch": 0.28393, + "grad_norm": 0.7622657095356815, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 28393 + }, + { + "epoch": 0.28394, + "grad_norm": 0.9814916871219838, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 28394 + }, + { + "epoch": 0.28395, + "grad_norm": 1.1303138220155156, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 28395 + }, + { + "epoch": 0.28396, + "grad_norm": 0.7871564755435672, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 28396 + }, + { + "epoch": 0.28397, + "grad_norm": 0.838521828857985, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 28397 + }, + { + "epoch": 0.28398, + "grad_norm": 0.8681297655485658, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 28398 + }, + { + "epoch": 0.28399, + "grad_norm": 0.8756869478337493, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 28399 + }, + { + "epoch": 0.284, + "grad_norm": 0.9643518966616424, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 28400 + }, + { + "epoch": 0.28401, + "grad_norm": 1.1302456786450705, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 28401 + }, + { + "epoch": 0.28402, + "grad_norm": 0.8666872990194784, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 28402 + }, + { + "epoch": 0.28403, + "grad_norm": 0.832245804372531, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 28403 + }, + { + "epoch": 0.28404, + "grad_norm": 0.8856896240636183, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 28404 + }, + { + "epoch": 0.28405, + "grad_norm": 0.9115384249373091, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 28405 + }, + { + "epoch": 0.28406, + "grad_norm": 0.8725193211098602, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 28406 + }, + { + "epoch": 0.28407, + "grad_norm": 0.8819107892812704, + "learning_rate": 0.003, + "loss": 4.0076, + "step": 28407 + }, + { + "epoch": 0.28408, + "grad_norm": 0.8455360368582739, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 28408 + }, + { + "epoch": 0.28409, + "grad_norm": 0.9698510361406761, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 28409 + }, + { + "epoch": 0.2841, + "grad_norm": 1.07553812050799, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 28410 + }, + { + "epoch": 0.28411, + "grad_norm": 0.9722725243211752, + "learning_rate": 0.003, + "loss": 4.069, + "step": 28411 + }, + { + "epoch": 0.28412, + "grad_norm": 0.8192839057346518, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 28412 + }, + { + "epoch": 0.28413, + "grad_norm": 0.7363819485702107, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 28413 + }, + { + "epoch": 0.28414, + "grad_norm": 0.723644647906333, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 28414 + }, + { + "epoch": 0.28415, + "grad_norm": 0.6395704231769905, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 28415 + }, + { + "epoch": 0.28416, + "grad_norm": 0.6155111558086961, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 28416 + }, + { + "epoch": 0.28417, + "grad_norm": 0.6449462020411405, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 28417 + }, + { + "epoch": 0.28418, + "grad_norm": 0.7329202444339759, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 28418 + }, + { + "epoch": 0.28419, + "grad_norm": 0.8501990402549656, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 28419 + }, + { + "epoch": 0.2842, + "grad_norm": 0.9581448385656717, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 28420 + }, + { + "epoch": 0.28421, + "grad_norm": 1.0191191598691747, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 28421 + }, + { + "epoch": 0.28422, + "grad_norm": 1.041331940603941, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 28422 + }, + { + "epoch": 0.28423, + "grad_norm": 0.9855758585026174, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 28423 + }, + { + "epoch": 0.28424, + "grad_norm": 0.9544220851668925, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 28424 + }, + { + "epoch": 0.28425, + "grad_norm": 0.9550008305004369, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 28425 + }, + { + "epoch": 0.28426, + "grad_norm": 0.9124848880008065, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 28426 + }, + { + "epoch": 0.28427, + "grad_norm": 0.8803979741572936, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 28427 + }, + { + "epoch": 0.28428, + "grad_norm": 0.7432629967963432, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 28428 + }, + { + "epoch": 0.28429, + "grad_norm": 0.7653633826344757, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 28429 + }, + { + "epoch": 0.2843, + "grad_norm": 0.9041638067837511, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 28430 + }, + { + "epoch": 0.28431, + "grad_norm": 0.908361844160174, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 28431 + }, + { + "epoch": 0.28432, + "grad_norm": 0.9150668784272112, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 28432 + }, + { + "epoch": 0.28433, + "grad_norm": 1.1063176773773953, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 28433 + }, + { + "epoch": 0.28434, + "grad_norm": 0.9984726618916955, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 28434 + }, + { + "epoch": 0.28435, + "grad_norm": 0.8926580982598674, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 28435 + }, + { + "epoch": 0.28436, + "grad_norm": 0.8419541420336488, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 28436 + }, + { + "epoch": 0.28437, + "grad_norm": 0.8834821940910866, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 28437 + }, + { + "epoch": 0.28438, + "grad_norm": 0.895215617353213, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 28438 + }, + { + "epoch": 0.28439, + "grad_norm": 0.8920475660278828, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 28439 + }, + { + "epoch": 0.2844, + "grad_norm": 0.8791568520933343, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 28440 + }, + { + "epoch": 0.28441, + "grad_norm": 0.7321723227516553, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 28441 + }, + { + "epoch": 0.28442, + "grad_norm": 0.6664770963616079, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 28442 + }, + { + "epoch": 0.28443, + "grad_norm": 0.6884860828476279, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 28443 + }, + { + "epoch": 0.28444, + "grad_norm": 0.7655128874371786, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 28444 + }, + { + "epoch": 0.28445, + "grad_norm": 0.8723799617328127, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 28445 + }, + { + "epoch": 0.28446, + "grad_norm": 0.9314716918099466, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 28446 + }, + { + "epoch": 0.28447, + "grad_norm": 0.8947768682975082, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 28447 + }, + { + "epoch": 0.28448, + "grad_norm": 0.8231683459094062, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 28448 + }, + { + "epoch": 0.28449, + "grad_norm": 0.73304671125213, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 28449 + }, + { + "epoch": 0.2845, + "grad_norm": 0.7703633254935931, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 28450 + }, + { + "epoch": 0.28451, + "grad_norm": 0.7048151986281054, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 28451 + }, + { + "epoch": 0.28452, + "grad_norm": 0.7136474784723533, + "learning_rate": 0.003, + "loss": 4.021, + "step": 28452 + }, + { + "epoch": 0.28453, + "grad_norm": 0.7750902793466988, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 28453 + }, + { + "epoch": 0.28454, + "grad_norm": 0.872546538204124, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 28454 + }, + { + "epoch": 0.28455, + "grad_norm": 1.1291460484687137, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 28455 + }, + { + "epoch": 0.28456, + "grad_norm": 0.9980106032398537, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 28456 + }, + { + "epoch": 0.28457, + "grad_norm": 0.9105613849000609, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 28457 + }, + { + "epoch": 0.28458, + "grad_norm": 0.7479972195739863, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 28458 + }, + { + "epoch": 0.28459, + "grad_norm": 0.644284996295813, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 28459 + }, + { + "epoch": 0.2846, + "grad_norm": 0.6237223931527431, + "learning_rate": 0.003, + "loss": 4.046, + "step": 28460 + }, + { + "epoch": 0.28461, + "grad_norm": 0.6348820492434564, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 28461 + }, + { + "epoch": 0.28462, + "grad_norm": 0.6213392631291724, + "learning_rate": 0.003, + "loss": 4.0072, + "step": 28462 + }, + { + "epoch": 0.28463, + "grad_norm": 0.6357174779111046, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 28463 + }, + { + "epoch": 0.28464, + "grad_norm": 0.5665670620071153, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 28464 + }, + { + "epoch": 0.28465, + "grad_norm": 0.6161838047603717, + "learning_rate": 0.003, + "loss": 3.9898, + "step": 28465 + }, + { + "epoch": 0.28466, + "grad_norm": 0.7964675794733751, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 28466 + }, + { + "epoch": 0.28467, + "grad_norm": 0.8814555301712172, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 28467 + }, + { + "epoch": 0.28468, + "grad_norm": 0.7596096370030949, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 28468 + }, + { + "epoch": 0.28469, + "grad_norm": 0.8004701181170579, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 28469 + }, + { + "epoch": 0.2847, + "grad_norm": 0.915412304902652, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 28470 + }, + { + "epoch": 0.28471, + "grad_norm": 0.9588664697622711, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 28471 + }, + { + "epoch": 0.28472, + "grad_norm": 0.9430994918592408, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 28472 + }, + { + "epoch": 0.28473, + "grad_norm": 0.9233894079109843, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 28473 + }, + { + "epoch": 0.28474, + "grad_norm": 0.8512179476191793, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 28474 + }, + { + "epoch": 0.28475, + "grad_norm": 0.8581245809766811, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 28475 + }, + { + "epoch": 0.28476, + "grad_norm": 0.9955910625421436, + "learning_rate": 0.003, + "loss": 4.049, + "step": 28476 + }, + { + "epoch": 0.28477, + "grad_norm": 0.9735494175072673, + "learning_rate": 0.003, + "loss": 4.066, + "step": 28477 + }, + { + "epoch": 0.28478, + "grad_norm": 0.8730166403331033, + "learning_rate": 0.003, + "loss": 4.042, + "step": 28478 + }, + { + "epoch": 0.28479, + "grad_norm": 0.8394401101563326, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 28479 + }, + { + "epoch": 0.2848, + "grad_norm": 0.8209927185750898, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 28480 + }, + { + "epoch": 0.28481, + "grad_norm": 0.6898575542034411, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 28481 + }, + { + "epoch": 0.28482, + "grad_norm": 0.6655976153341645, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 28482 + }, + { + "epoch": 0.28483, + "grad_norm": 0.6694593539182925, + "learning_rate": 0.003, + "loss": 4.03, + "step": 28483 + }, + { + "epoch": 0.28484, + "grad_norm": 0.7080972119612121, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 28484 + }, + { + "epoch": 0.28485, + "grad_norm": 0.7409143657379968, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 28485 + }, + { + "epoch": 0.28486, + "grad_norm": 1.0944928294965413, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 28486 + }, + { + "epoch": 0.28487, + "grad_norm": 1.3724663561888903, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 28487 + }, + { + "epoch": 0.28488, + "grad_norm": 0.7315683852221001, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 28488 + }, + { + "epoch": 0.28489, + "grad_norm": 0.7818253941184138, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 28489 + }, + { + "epoch": 0.2849, + "grad_norm": 0.8341947128369528, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 28490 + }, + { + "epoch": 0.28491, + "grad_norm": 0.725608886284378, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 28491 + }, + { + "epoch": 0.28492, + "grad_norm": 0.6613121276164556, + "learning_rate": 0.003, + "loss": 4.046, + "step": 28492 + }, + { + "epoch": 0.28493, + "grad_norm": 0.7222776491795632, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 28493 + }, + { + "epoch": 0.28494, + "grad_norm": 0.7651053271137258, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 28494 + }, + { + "epoch": 0.28495, + "grad_norm": 0.8241155947178603, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 28495 + }, + { + "epoch": 0.28496, + "grad_norm": 0.9300505671250392, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 28496 + }, + { + "epoch": 0.28497, + "grad_norm": 1.197555813064701, + "learning_rate": 0.003, + "loss": 4.059, + "step": 28497 + }, + { + "epoch": 0.28498, + "grad_norm": 0.9496102676826441, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 28498 + }, + { + "epoch": 0.28499, + "grad_norm": 0.8920110158130717, + "learning_rate": 0.003, + "loss": 4.085, + "step": 28499 + }, + { + "epoch": 0.285, + "grad_norm": 1.0094113342480808, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 28500 + }, + { + "epoch": 0.28501, + "grad_norm": 1.3202217504599303, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 28501 + }, + { + "epoch": 0.28502, + "grad_norm": 0.714701401976666, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 28502 + }, + { + "epoch": 0.28503, + "grad_norm": 0.6773396062090474, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 28503 + }, + { + "epoch": 0.28504, + "grad_norm": 0.7034994891811566, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 28504 + }, + { + "epoch": 0.28505, + "grad_norm": 0.7965898897489029, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 28505 + }, + { + "epoch": 0.28506, + "grad_norm": 0.8545325143962685, + "learning_rate": 0.003, + "loss": 4.024, + "step": 28506 + }, + { + "epoch": 0.28507, + "grad_norm": 0.9317711920157477, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 28507 + }, + { + "epoch": 0.28508, + "grad_norm": 1.0434979388102015, + "learning_rate": 0.003, + "loss": 4.05, + "step": 28508 + }, + { + "epoch": 0.28509, + "grad_norm": 1.0218177268241753, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 28509 + }, + { + "epoch": 0.2851, + "grad_norm": 0.9543296747827839, + "learning_rate": 0.003, + "loss": 4.038, + "step": 28510 + }, + { + "epoch": 0.28511, + "grad_norm": 0.9461991779710117, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 28511 + }, + { + "epoch": 0.28512, + "grad_norm": 1.078060467860929, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 28512 + }, + { + "epoch": 0.28513, + "grad_norm": 1.0543500472295795, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 28513 + }, + { + "epoch": 0.28514, + "grad_norm": 0.8399092739365148, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 28514 + }, + { + "epoch": 0.28515, + "grad_norm": 0.7193185953118837, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 28515 + }, + { + "epoch": 0.28516, + "grad_norm": 0.6916142275440277, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 28516 + }, + { + "epoch": 0.28517, + "grad_norm": 0.649379086817287, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 28517 + }, + { + "epoch": 0.28518, + "grad_norm": 0.6712537822577079, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 28518 + }, + { + "epoch": 0.28519, + "grad_norm": 0.699236670919987, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 28519 + }, + { + "epoch": 0.2852, + "grad_norm": 0.7303392710594824, + "learning_rate": 0.003, + "loss": 4.058, + "step": 28520 + }, + { + "epoch": 0.28521, + "grad_norm": 0.854304461741675, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 28521 + }, + { + "epoch": 0.28522, + "grad_norm": 0.9879579694865874, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 28522 + }, + { + "epoch": 0.28523, + "grad_norm": 1.1572212914390956, + "learning_rate": 0.003, + "loss": 4.028, + "step": 28523 + }, + { + "epoch": 0.28524, + "grad_norm": 0.9435352112026604, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 28524 + }, + { + "epoch": 0.28525, + "grad_norm": 0.8135650918046783, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 28525 + }, + { + "epoch": 0.28526, + "grad_norm": 0.5954292318616392, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 28526 + }, + { + "epoch": 0.28527, + "grad_norm": 0.5550034604070853, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 28527 + }, + { + "epoch": 0.28528, + "grad_norm": 0.5823415912901907, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 28528 + }, + { + "epoch": 0.28529, + "grad_norm": 0.69477055144928, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 28529 + }, + { + "epoch": 0.2853, + "grad_norm": 0.8610315300373312, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 28530 + }, + { + "epoch": 0.28531, + "grad_norm": 0.9474318016661692, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 28531 + }, + { + "epoch": 0.28532, + "grad_norm": 0.9545779027009803, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 28532 + }, + { + "epoch": 0.28533, + "grad_norm": 1.0388622216645689, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 28533 + }, + { + "epoch": 0.28534, + "grad_norm": 1.0689835997213255, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 28534 + }, + { + "epoch": 0.28535, + "grad_norm": 0.8960524614549431, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 28535 + }, + { + "epoch": 0.28536, + "grad_norm": 0.8710658925915614, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 28536 + }, + { + "epoch": 0.28537, + "grad_norm": 0.8686052293673259, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 28537 + }, + { + "epoch": 0.28538, + "grad_norm": 0.9228534648871559, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 28538 + }, + { + "epoch": 0.28539, + "grad_norm": 0.8042315973586255, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 28539 + }, + { + "epoch": 0.2854, + "grad_norm": 0.7959157240856279, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 28540 + }, + { + "epoch": 0.28541, + "grad_norm": 0.8606723491854665, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 28541 + }, + { + "epoch": 0.28542, + "grad_norm": 1.0683246095857424, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 28542 + }, + { + "epoch": 0.28543, + "grad_norm": 0.9758308279913178, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 28543 + }, + { + "epoch": 0.28544, + "grad_norm": 1.122155327128848, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 28544 + }, + { + "epoch": 0.28545, + "grad_norm": 0.9750212439586787, + "learning_rate": 0.003, + "loss": 4.054, + "step": 28545 + }, + { + "epoch": 0.28546, + "grad_norm": 1.015453367392447, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 28546 + }, + { + "epoch": 0.28547, + "grad_norm": 0.7229826685672138, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 28547 + }, + { + "epoch": 0.28548, + "grad_norm": 0.6207732667918442, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 28548 + }, + { + "epoch": 0.28549, + "grad_norm": 0.6833740728352132, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 28549 + }, + { + "epoch": 0.2855, + "grad_norm": 0.8193464028543707, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 28550 + }, + { + "epoch": 0.28551, + "grad_norm": 0.9663075532351895, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 28551 + }, + { + "epoch": 0.28552, + "grad_norm": 1.116074638181051, + "learning_rate": 0.003, + "loss": 4.068, + "step": 28552 + }, + { + "epoch": 0.28553, + "grad_norm": 1.1182803694070165, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 28553 + }, + { + "epoch": 0.28554, + "grad_norm": 0.8670533934733222, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 28554 + }, + { + "epoch": 0.28555, + "grad_norm": 0.7196127856963327, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 28555 + }, + { + "epoch": 0.28556, + "grad_norm": 0.7661445983433341, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 28556 + }, + { + "epoch": 0.28557, + "grad_norm": 0.8298300962898633, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 28557 + }, + { + "epoch": 0.28558, + "grad_norm": 0.8582227660318542, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 28558 + }, + { + "epoch": 0.28559, + "grad_norm": 0.8062986892552619, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 28559 + }, + { + "epoch": 0.2856, + "grad_norm": 0.9717933107357863, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 28560 + }, + { + "epoch": 0.28561, + "grad_norm": 1.1986161870703396, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 28561 + }, + { + "epoch": 0.28562, + "grad_norm": 0.9836828523483168, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 28562 + }, + { + "epoch": 0.28563, + "grad_norm": 0.9435272210316958, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 28563 + }, + { + "epoch": 0.28564, + "grad_norm": 0.920647301952589, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 28564 + }, + { + "epoch": 0.28565, + "grad_norm": 0.9271613864532717, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 28565 + }, + { + "epoch": 0.28566, + "grad_norm": 0.8921556479899426, + "learning_rate": 0.003, + "loss": 4.03, + "step": 28566 + }, + { + "epoch": 0.28567, + "grad_norm": 0.7857607501125209, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 28567 + }, + { + "epoch": 0.28568, + "grad_norm": 0.8438862192460418, + "learning_rate": 0.003, + "loss": 4.058, + "step": 28568 + }, + { + "epoch": 0.28569, + "grad_norm": 0.8807621010920424, + "learning_rate": 0.003, + "loss": 4.0051, + "step": 28569 + }, + { + "epoch": 0.2857, + "grad_norm": 0.8931263682673422, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 28570 + }, + { + "epoch": 0.28571, + "grad_norm": 0.8330816110416649, + "learning_rate": 0.003, + "loss": 4.069, + "step": 28571 + }, + { + "epoch": 0.28572, + "grad_norm": 0.8753928920061944, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 28572 + }, + { + "epoch": 0.28573, + "grad_norm": 1.063417726032065, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 28573 + }, + { + "epoch": 0.28574, + "grad_norm": 0.9913479555119775, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 28574 + }, + { + "epoch": 0.28575, + "grad_norm": 0.9321792951268351, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 28575 + }, + { + "epoch": 0.28576, + "grad_norm": 0.877538111941257, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 28576 + }, + { + "epoch": 0.28577, + "grad_norm": 0.8808790612965013, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 28577 + }, + { + "epoch": 0.28578, + "grad_norm": 0.9720485046907056, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 28578 + }, + { + "epoch": 0.28579, + "grad_norm": 1.0955368974720248, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 28579 + }, + { + "epoch": 0.2858, + "grad_norm": 0.9721233750913911, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 28580 + }, + { + "epoch": 0.28581, + "grad_norm": 0.7744423709794399, + "learning_rate": 0.003, + "loss": 4.017, + "step": 28581 + }, + { + "epoch": 0.28582, + "grad_norm": 0.7837454356095888, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 28582 + }, + { + "epoch": 0.28583, + "grad_norm": 0.834694152480912, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 28583 + }, + { + "epoch": 0.28584, + "grad_norm": 0.8257888028663101, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 28584 + }, + { + "epoch": 0.28585, + "grad_norm": 0.8028856031538308, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 28585 + }, + { + "epoch": 0.28586, + "grad_norm": 0.7632926029396118, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 28586 + }, + { + "epoch": 0.28587, + "grad_norm": 0.765855807489956, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 28587 + }, + { + "epoch": 0.28588, + "grad_norm": 0.8003726641525769, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 28588 + }, + { + "epoch": 0.28589, + "grad_norm": 0.8119539242130261, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 28589 + }, + { + "epoch": 0.2859, + "grad_norm": 0.8543732068372208, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 28590 + }, + { + "epoch": 0.28591, + "grad_norm": 0.8966118337674691, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 28591 + }, + { + "epoch": 0.28592, + "grad_norm": 0.8583802156790714, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 28592 + }, + { + "epoch": 0.28593, + "grad_norm": 0.8114882664944365, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 28593 + }, + { + "epoch": 0.28594, + "grad_norm": 0.8680560360455098, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 28594 + }, + { + "epoch": 0.28595, + "grad_norm": 0.9149883128882348, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 28595 + }, + { + "epoch": 0.28596, + "grad_norm": 0.7677318609263302, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 28596 + }, + { + "epoch": 0.28597, + "grad_norm": 0.6630941055664152, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 28597 + }, + { + "epoch": 0.28598, + "grad_norm": 0.6380367561684493, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 28598 + }, + { + "epoch": 0.28599, + "grad_norm": 0.6046570836883874, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 28599 + }, + { + "epoch": 0.286, + "grad_norm": 0.76722479702395, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 28600 + }, + { + "epoch": 0.28601, + "grad_norm": 1.0413249864512195, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 28601 + }, + { + "epoch": 0.28602, + "grad_norm": 1.3718316146930083, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 28602 + }, + { + "epoch": 0.28603, + "grad_norm": 0.647513863970598, + "learning_rate": 0.003, + "loss": 4.044, + "step": 28603 + }, + { + "epoch": 0.28604, + "grad_norm": 0.6694479910393238, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 28604 + }, + { + "epoch": 0.28605, + "grad_norm": 0.7162310994372082, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 28605 + }, + { + "epoch": 0.28606, + "grad_norm": 0.7907453917272956, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 28606 + }, + { + "epoch": 0.28607, + "grad_norm": 0.9015311342643925, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 28607 + }, + { + "epoch": 0.28608, + "grad_norm": 0.9565272242600616, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 28608 + }, + { + "epoch": 0.28609, + "grad_norm": 0.9015540131307266, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 28609 + }, + { + "epoch": 0.2861, + "grad_norm": 0.8197512709206659, + "learning_rate": 0.003, + "loss": 4.058, + "step": 28610 + }, + { + "epoch": 0.28611, + "grad_norm": 0.7474938293075504, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 28611 + }, + { + "epoch": 0.28612, + "grad_norm": 0.8078006431592246, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 28612 + }, + { + "epoch": 0.28613, + "grad_norm": 0.776324046700105, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 28613 + }, + { + "epoch": 0.28614, + "grad_norm": 0.8021374395950123, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 28614 + }, + { + "epoch": 0.28615, + "grad_norm": 1.0001348768999727, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 28615 + }, + { + "epoch": 0.28616, + "grad_norm": 1.224230843035204, + "learning_rate": 0.003, + "loss": 3.9937, + "step": 28616 + }, + { + "epoch": 0.28617, + "grad_norm": 0.7579693786774793, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 28617 + }, + { + "epoch": 0.28618, + "grad_norm": 0.6532911366541334, + "learning_rate": 0.003, + "loss": 4.036, + "step": 28618 + }, + { + "epoch": 0.28619, + "grad_norm": 0.607815815802335, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 28619 + }, + { + "epoch": 0.2862, + "grad_norm": 0.589361678870917, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 28620 + }, + { + "epoch": 0.28621, + "grad_norm": 0.6785879696406384, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 28621 + }, + { + "epoch": 0.28622, + "grad_norm": 0.7637479860788927, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 28622 + }, + { + "epoch": 0.28623, + "grad_norm": 0.8794926946523457, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 28623 + }, + { + "epoch": 0.28624, + "grad_norm": 0.9969838901943914, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 28624 + }, + { + "epoch": 0.28625, + "grad_norm": 1.327868933509164, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 28625 + }, + { + "epoch": 0.28626, + "grad_norm": 0.7638980944749479, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 28626 + }, + { + "epoch": 0.28627, + "grad_norm": 0.8072625180813008, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 28627 + }, + { + "epoch": 0.28628, + "grad_norm": 0.8667211698176158, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 28628 + }, + { + "epoch": 0.28629, + "grad_norm": 1.067362063753034, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 28629 + }, + { + "epoch": 0.2863, + "grad_norm": 0.9901283746064967, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 28630 + }, + { + "epoch": 0.28631, + "grad_norm": 0.8102771853845896, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 28631 + }, + { + "epoch": 0.28632, + "grad_norm": 0.721959790454203, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 28632 + }, + { + "epoch": 0.28633, + "grad_norm": 0.7532524916197352, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 28633 + }, + { + "epoch": 0.28634, + "grad_norm": 0.8044392442393213, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 28634 + }, + { + "epoch": 0.28635, + "grad_norm": 0.7483603571713031, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 28635 + }, + { + "epoch": 0.28636, + "grad_norm": 0.7298254490320449, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 28636 + }, + { + "epoch": 0.28637, + "grad_norm": 0.7614227112349682, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 28637 + }, + { + "epoch": 0.28638, + "grad_norm": 0.9044979061465064, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 28638 + }, + { + "epoch": 0.28639, + "grad_norm": 1.0577665845310897, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 28639 + }, + { + "epoch": 0.2864, + "grad_norm": 1.1633235569633706, + "learning_rate": 0.003, + "loss": 4.04, + "step": 28640 + }, + { + "epoch": 0.28641, + "grad_norm": 0.8753405851494542, + "learning_rate": 0.003, + "loss": 4.053, + "step": 28641 + }, + { + "epoch": 0.28642, + "grad_norm": 0.9183848659716629, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 28642 + }, + { + "epoch": 0.28643, + "grad_norm": 0.8418929349304822, + "learning_rate": 0.003, + "loss": 4.029, + "step": 28643 + }, + { + "epoch": 0.28644, + "grad_norm": 0.8033380734837967, + "learning_rate": 0.003, + "loss": 4.05, + "step": 28644 + }, + { + "epoch": 0.28645, + "grad_norm": 0.8029906141253343, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 28645 + }, + { + "epoch": 0.28646, + "grad_norm": 0.7514034302201833, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 28646 + }, + { + "epoch": 0.28647, + "grad_norm": 0.8380963080702498, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 28647 + }, + { + "epoch": 0.28648, + "grad_norm": 0.9552567650120333, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 28648 + }, + { + "epoch": 0.28649, + "grad_norm": 1.0962255447468026, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 28649 + }, + { + "epoch": 0.2865, + "grad_norm": 1.1013136835734934, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 28650 + }, + { + "epoch": 0.28651, + "grad_norm": 1.0179291792508771, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 28651 + }, + { + "epoch": 0.28652, + "grad_norm": 0.9057586051898991, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 28652 + }, + { + "epoch": 0.28653, + "grad_norm": 0.7950679078148759, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 28653 + }, + { + "epoch": 0.28654, + "grad_norm": 0.7555567030866315, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 28654 + }, + { + "epoch": 0.28655, + "grad_norm": 0.759306410472044, + "learning_rate": 0.003, + "loss": 4.014, + "step": 28655 + }, + { + "epoch": 0.28656, + "grad_norm": 0.7461651297571248, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 28656 + }, + { + "epoch": 0.28657, + "grad_norm": 0.7297003445907126, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 28657 + }, + { + "epoch": 0.28658, + "grad_norm": 0.8060513672962927, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 28658 + }, + { + "epoch": 0.28659, + "grad_norm": 0.9370072654042834, + "learning_rate": 0.003, + "loss": 4.059, + "step": 28659 + }, + { + "epoch": 0.2866, + "grad_norm": 1.0319254570205272, + "learning_rate": 0.003, + "loss": 3.9939, + "step": 28660 + }, + { + "epoch": 0.28661, + "grad_norm": 0.959529483172278, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 28661 + }, + { + "epoch": 0.28662, + "grad_norm": 0.9049348164694172, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 28662 + }, + { + "epoch": 0.28663, + "grad_norm": 0.9407665629700497, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 28663 + }, + { + "epoch": 0.28664, + "grad_norm": 1.1415550483819394, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 28664 + }, + { + "epoch": 0.28665, + "grad_norm": 1.0271410783947623, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 28665 + }, + { + "epoch": 0.28666, + "grad_norm": 1.0855898539370348, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 28666 + }, + { + "epoch": 0.28667, + "grad_norm": 0.9559186613458154, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 28667 + }, + { + "epoch": 0.28668, + "grad_norm": 0.8321087329983451, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 28668 + }, + { + "epoch": 0.28669, + "grad_norm": 0.81674593971458, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 28669 + }, + { + "epoch": 0.2867, + "grad_norm": 0.898659393278337, + "learning_rate": 0.003, + "loss": 4.033, + "step": 28670 + }, + { + "epoch": 0.28671, + "grad_norm": 0.8258728722454893, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 28671 + }, + { + "epoch": 0.28672, + "grad_norm": 0.7601349614760261, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 28672 + }, + { + "epoch": 0.28673, + "grad_norm": 0.7376150996390509, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 28673 + }, + { + "epoch": 0.28674, + "grad_norm": 0.7635617471339808, + "learning_rate": 0.003, + "loss": 4.04, + "step": 28674 + }, + { + "epoch": 0.28675, + "grad_norm": 0.7483350875269462, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 28675 + }, + { + "epoch": 0.28676, + "grad_norm": 0.8020892072543045, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 28676 + }, + { + "epoch": 0.28677, + "grad_norm": 0.9414117627556049, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 28677 + }, + { + "epoch": 0.28678, + "grad_norm": 1.2307854652514987, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 28678 + }, + { + "epoch": 0.28679, + "grad_norm": 0.6755227858832771, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 28679 + }, + { + "epoch": 0.2868, + "grad_norm": 0.6376590984908987, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 28680 + }, + { + "epoch": 0.28681, + "grad_norm": 0.7023829520388835, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 28681 + }, + { + "epoch": 0.28682, + "grad_norm": 0.7487917300890066, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 28682 + }, + { + "epoch": 0.28683, + "grad_norm": 0.8087027166528684, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 28683 + }, + { + "epoch": 0.28684, + "grad_norm": 0.8709890369655098, + "learning_rate": 0.003, + "loss": 4.051, + "step": 28684 + }, + { + "epoch": 0.28685, + "grad_norm": 1.0119458085120403, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 28685 + }, + { + "epoch": 0.28686, + "grad_norm": 0.9936016035354587, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 28686 + }, + { + "epoch": 0.28687, + "grad_norm": 0.9684957849152305, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 28687 + }, + { + "epoch": 0.28688, + "grad_norm": 0.9873728898311748, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 28688 + }, + { + "epoch": 0.28689, + "grad_norm": 1.0965230949983917, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 28689 + }, + { + "epoch": 0.2869, + "grad_norm": 1.0044953846872462, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 28690 + }, + { + "epoch": 0.28691, + "grad_norm": 1.0788124342285026, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 28691 + }, + { + "epoch": 0.28692, + "grad_norm": 0.9866010364597724, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 28692 + }, + { + "epoch": 0.28693, + "grad_norm": 0.9531341576077421, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 28693 + }, + { + "epoch": 0.28694, + "grad_norm": 0.9289584530079585, + "learning_rate": 0.003, + "loss": 4.063, + "step": 28694 + }, + { + "epoch": 0.28695, + "grad_norm": 1.008529812332349, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 28695 + }, + { + "epoch": 0.28696, + "grad_norm": 1.199599042504214, + "learning_rate": 0.003, + "loss": 4.06, + "step": 28696 + }, + { + "epoch": 0.28697, + "grad_norm": 1.0406176220560317, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 28697 + }, + { + "epoch": 0.28698, + "grad_norm": 1.0664840432856348, + "learning_rate": 0.003, + "loss": 4.079, + "step": 28698 + }, + { + "epoch": 0.28699, + "grad_norm": 0.9749718717655039, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 28699 + }, + { + "epoch": 0.287, + "grad_norm": 0.9742398814310319, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 28700 + }, + { + "epoch": 0.28701, + "grad_norm": 0.9160562374714015, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 28701 + }, + { + "epoch": 0.28702, + "grad_norm": 0.9336203803684406, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 28702 + }, + { + "epoch": 0.28703, + "grad_norm": 1.0323900847739385, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 28703 + }, + { + "epoch": 0.28704, + "grad_norm": 1.045266676005433, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 28704 + }, + { + "epoch": 0.28705, + "grad_norm": 0.7458410017829225, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 28705 + }, + { + "epoch": 0.28706, + "grad_norm": 0.5410785406428957, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 28706 + }, + { + "epoch": 0.28707, + "grad_norm": 0.5228453310856197, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 28707 + }, + { + "epoch": 0.28708, + "grad_norm": 0.6608791603326084, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 28708 + }, + { + "epoch": 0.28709, + "grad_norm": 0.678162447287557, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 28709 + }, + { + "epoch": 0.2871, + "grad_norm": 0.6816561773919839, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 28710 + }, + { + "epoch": 0.28711, + "grad_norm": 0.6578772151230273, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 28711 + }, + { + "epoch": 0.28712, + "grad_norm": 0.6367433544764254, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 28712 + }, + { + "epoch": 0.28713, + "grad_norm": 0.6895836421817106, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 28713 + }, + { + "epoch": 0.28714, + "grad_norm": 0.8043269558426925, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 28714 + }, + { + "epoch": 0.28715, + "grad_norm": 0.8894859494955192, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 28715 + }, + { + "epoch": 0.28716, + "grad_norm": 0.9297689490711322, + "learning_rate": 0.003, + "loss": 4.022, + "step": 28716 + }, + { + "epoch": 0.28717, + "grad_norm": 0.8500603200789149, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 28717 + }, + { + "epoch": 0.28718, + "grad_norm": 0.8222017380156561, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 28718 + }, + { + "epoch": 0.28719, + "grad_norm": 0.7544456531885075, + "learning_rate": 0.003, + "loss": 4.054, + "step": 28719 + }, + { + "epoch": 0.2872, + "grad_norm": 0.743072546748044, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 28720 + }, + { + "epoch": 0.28721, + "grad_norm": 0.7478512798035548, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 28721 + }, + { + "epoch": 0.28722, + "grad_norm": 0.7275563534846363, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 28722 + }, + { + "epoch": 0.28723, + "grad_norm": 0.7003112390131832, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 28723 + }, + { + "epoch": 0.28724, + "grad_norm": 0.626165082003098, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 28724 + }, + { + "epoch": 0.28725, + "grad_norm": 0.6369236092179669, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 28725 + }, + { + "epoch": 0.28726, + "grad_norm": 0.6749614811922979, + "learning_rate": 0.003, + "loss": 4.037, + "step": 28726 + }, + { + "epoch": 0.28727, + "grad_norm": 0.7721855282634369, + "learning_rate": 0.003, + "loss": 4.025, + "step": 28727 + }, + { + "epoch": 0.28728, + "grad_norm": 1.0819574725746266, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 28728 + }, + { + "epoch": 0.28729, + "grad_norm": 1.0922023903011702, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 28729 + }, + { + "epoch": 0.2873, + "grad_norm": 0.7990870203089259, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 28730 + }, + { + "epoch": 0.28731, + "grad_norm": 0.7862977174274907, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 28731 + }, + { + "epoch": 0.28732, + "grad_norm": 0.9726626359080396, + "learning_rate": 0.003, + "loss": 4.048, + "step": 28732 + }, + { + "epoch": 0.28733, + "grad_norm": 0.9407940799156077, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 28733 + }, + { + "epoch": 0.28734, + "grad_norm": 0.7893996055339209, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 28734 + }, + { + "epoch": 0.28735, + "grad_norm": 0.8885659286168502, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 28735 + }, + { + "epoch": 0.28736, + "grad_norm": 0.9024200508878604, + "learning_rate": 0.003, + "loss": 4.019, + "step": 28736 + }, + { + "epoch": 0.28737, + "grad_norm": 1.0081483264653155, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 28737 + }, + { + "epoch": 0.28738, + "grad_norm": 1.105446664767159, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 28738 + }, + { + "epoch": 0.28739, + "grad_norm": 1.0604018609584864, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 28739 + }, + { + "epoch": 0.2874, + "grad_norm": 0.8987620176287859, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 28740 + }, + { + "epoch": 0.28741, + "grad_norm": 0.8639899383418974, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 28741 + }, + { + "epoch": 0.28742, + "grad_norm": 0.9162535230069342, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 28742 + }, + { + "epoch": 0.28743, + "grad_norm": 0.9799438023227245, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 28743 + }, + { + "epoch": 0.28744, + "grad_norm": 0.7935329609065511, + "learning_rate": 0.003, + "loss": 3.995, + "step": 28744 + }, + { + "epoch": 0.28745, + "grad_norm": 0.8338727445704894, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 28745 + }, + { + "epoch": 0.28746, + "grad_norm": 0.905266957909698, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 28746 + }, + { + "epoch": 0.28747, + "grad_norm": 0.8770940628299769, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 28747 + }, + { + "epoch": 0.28748, + "grad_norm": 0.8432510573101485, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 28748 + }, + { + "epoch": 0.28749, + "grad_norm": 0.7797301630800301, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 28749 + }, + { + "epoch": 0.2875, + "grad_norm": 0.7270976706839277, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 28750 + }, + { + "epoch": 0.28751, + "grad_norm": 0.6731775501500047, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 28751 + }, + { + "epoch": 0.28752, + "grad_norm": 0.6621028070033257, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 28752 + }, + { + "epoch": 0.28753, + "grad_norm": 0.7033534492577809, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 28753 + }, + { + "epoch": 0.28754, + "grad_norm": 0.8058519496340879, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 28754 + }, + { + "epoch": 0.28755, + "grad_norm": 1.0755681891079643, + "learning_rate": 0.003, + "loss": 4.058, + "step": 28755 + }, + { + "epoch": 0.28756, + "grad_norm": 1.1819136179865433, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 28756 + }, + { + "epoch": 0.28757, + "grad_norm": 0.8585584032443118, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 28757 + }, + { + "epoch": 0.28758, + "grad_norm": 0.8194575693359871, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 28758 + }, + { + "epoch": 0.28759, + "grad_norm": 0.7758675257039371, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 28759 + }, + { + "epoch": 0.2876, + "grad_norm": 0.8804873978770608, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 28760 + }, + { + "epoch": 0.28761, + "grad_norm": 1.0207675112557233, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 28761 + }, + { + "epoch": 0.28762, + "grad_norm": 1.0070142682175276, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 28762 + }, + { + "epoch": 0.28763, + "grad_norm": 0.9032608936747407, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 28763 + }, + { + "epoch": 0.28764, + "grad_norm": 0.7944451678710528, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 28764 + }, + { + "epoch": 0.28765, + "grad_norm": 0.9757005427019756, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 28765 + }, + { + "epoch": 0.28766, + "grad_norm": 1.0567364362593248, + "learning_rate": 0.003, + "loss": 4.084, + "step": 28766 + }, + { + "epoch": 0.28767, + "grad_norm": 1.1374987464847945, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 28767 + }, + { + "epoch": 0.28768, + "grad_norm": 0.6875343980740054, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 28768 + }, + { + "epoch": 0.28769, + "grad_norm": 0.5438746668219597, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 28769 + }, + { + "epoch": 0.2877, + "grad_norm": 0.5723042393513118, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 28770 + }, + { + "epoch": 0.28771, + "grad_norm": 0.5323460551822498, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 28771 + }, + { + "epoch": 0.28772, + "grad_norm": 0.5883430515516334, + "learning_rate": 0.003, + "loss": 4.048, + "step": 28772 + }, + { + "epoch": 0.28773, + "grad_norm": 0.6394326994428765, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 28773 + }, + { + "epoch": 0.28774, + "grad_norm": 0.7189978536918776, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 28774 + }, + { + "epoch": 0.28775, + "grad_norm": 0.7939359549607259, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 28775 + }, + { + "epoch": 0.28776, + "grad_norm": 0.9370543226277266, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 28776 + }, + { + "epoch": 0.28777, + "grad_norm": 1.0926709991177488, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 28777 + }, + { + "epoch": 0.28778, + "grad_norm": 0.849676799590963, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 28778 + }, + { + "epoch": 0.28779, + "grad_norm": 0.858728108204247, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 28779 + }, + { + "epoch": 0.2878, + "grad_norm": 0.8931529320563203, + "learning_rate": 0.003, + "loss": 4.03, + "step": 28780 + }, + { + "epoch": 0.28781, + "grad_norm": 0.9567121839009043, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 28781 + }, + { + "epoch": 0.28782, + "grad_norm": 1.2009268673973026, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 28782 + }, + { + "epoch": 0.28783, + "grad_norm": 0.6718118412740363, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 28783 + }, + { + "epoch": 0.28784, + "grad_norm": 0.6821660674523115, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 28784 + }, + { + "epoch": 0.28785, + "grad_norm": 0.7593966336022011, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 28785 + }, + { + "epoch": 0.28786, + "grad_norm": 0.8053404883518945, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 28786 + }, + { + "epoch": 0.28787, + "grad_norm": 0.8054525415212919, + "learning_rate": 0.003, + "loss": 4.021, + "step": 28787 + }, + { + "epoch": 0.28788, + "grad_norm": 0.7578412395228789, + "learning_rate": 0.003, + "loss": 4.029, + "step": 28788 + }, + { + "epoch": 0.28789, + "grad_norm": 0.7403241891017145, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 28789 + }, + { + "epoch": 0.2879, + "grad_norm": 0.6962959568470731, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 28790 + }, + { + "epoch": 0.28791, + "grad_norm": 0.6811919069561606, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 28791 + }, + { + "epoch": 0.28792, + "grad_norm": 0.7109143023297189, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 28792 + }, + { + "epoch": 0.28793, + "grad_norm": 0.7334447919163939, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 28793 + }, + { + "epoch": 0.28794, + "grad_norm": 0.8498749477540435, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 28794 + }, + { + "epoch": 0.28795, + "grad_norm": 1.0756288389505617, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 28795 + }, + { + "epoch": 0.28796, + "grad_norm": 1.0032295788374341, + "learning_rate": 0.003, + "loss": 4.039, + "step": 28796 + }, + { + "epoch": 0.28797, + "grad_norm": 0.9748250333262553, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 28797 + }, + { + "epoch": 0.28798, + "grad_norm": 1.0208362015596955, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 28798 + }, + { + "epoch": 0.28799, + "grad_norm": 0.9137221224484092, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 28799 + }, + { + "epoch": 0.288, + "grad_norm": 0.8194446088536961, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 28800 + }, + { + "epoch": 0.28801, + "grad_norm": 0.8639125685223914, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 28801 + }, + { + "epoch": 0.28802, + "grad_norm": 0.9747055878039127, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 28802 + }, + { + "epoch": 0.28803, + "grad_norm": 1.1328249528977736, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 28803 + }, + { + "epoch": 0.28804, + "grad_norm": 0.8552082138315015, + "learning_rate": 0.003, + "loss": 4.048, + "step": 28804 + }, + { + "epoch": 0.28805, + "grad_norm": 0.8190036674994713, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 28805 + }, + { + "epoch": 0.28806, + "grad_norm": 0.8373972133324258, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 28806 + }, + { + "epoch": 0.28807, + "grad_norm": 0.9410445186854962, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 28807 + }, + { + "epoch": 0.28808, + "grad_norm": 0.9707539856951962, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 28808 + }, + { + "epoch": 0.28809, + "grad_norm": 0.923905654628016, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 28809 + }, + { + "epoch": 0.2881, + "grad_norm": 1.0151648182286128, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 28810 + }, + { + "epoch": 0.28811, + "grad_norm": 1.0835985748612, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 28811 + }, + { + "epoch": 0.28812, + "grad_norm": 0.9070135643145147, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 28812 + }, + { + "epoch": 0.28813, + "grad_norm": 0.8059434748656653, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 28813 + }, + { + "epoch": 0.28814, + "grad_norm": 0.7204358888281435, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 28814 + }, + { + "epoch": 0.28815, + "grad_norm": 0.6784356433417426, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 28815 + }, + { + "epoch": 0.28816, + "grad_norm": 0.7295146138667735, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 28816 + }, + { + "epoch": 0.28817, + "grad_norm": 0.8349732600018718, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 28817 + }, + { + "epoch": 0.28818, + "grad_norm": 0.9393081716033626, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 28818 + }, + { + "epoch": 0.28819, + "grad_norm": 1.0553506114743156, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 28819 + }, + { + "epoch": 0.2882, + "grad_norm": 1.0593675142873102, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 28820 + }, + { + "epoch": 0.28821, + "grad_norm": 1.0392547297037715, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 28821 + }, + { + "epoch": 0.28822, + "grad_norm": 1.0008005915660343, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 28822 + }, + { + "epoch": 0.28823, + "grad_norm": 0.9938080828193561, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 28823 + }, + { + "epoch": 0.28824, + "grad_norm": 0.992620849940634, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 28824 + }, + { + "epoch": 0.28825, + "grad_norm": 0.9725729850849965, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 28825 + }, + { + "epoch": 0.28826, + "grad_norm": 0.9224416538826993, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 28826 + }, + { + "epoch": 0.28827, + "grad_norm": 1.001226553359419, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 28827 + }, + { + "epoch": 0.28828, + "grad_norm": 0.9684047594386208, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 28828 + }, + { + "epoch": 0.28829, + "grad_norm": 0.9621480717888037, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 28829 + }, + { + "epoch": 0.2883, + "grad_norm": 1.0187155697994454, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 28830 + }, + { + "epoch": 0.28831, + "grad_norm": 1.05891288616295, + "learning_rate": 0.003, + "loss": 4.057, + "step": 28831 + }, + { + "epoch": 0.28832, + "grad_norm": 0.9899475657299306, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 28832 + }, + { + "epoch": 0.28833, + "grad_norm": 0.9037773444226085, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 28833 + }, + { + "epoch": 0.28834, + "grad_norm": 0.8439236963182871, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 28834 + }, + { + "epoch": 0.28835, + "grad_norm": 0.8056800710415504, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 28835 + }, + { + "epoch": 0.28836, + "grad_norm": 0.6498983608735496, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 28836 + }, + { + "epoch": 0.28837, + "grad_norm": 0.5384200235370281, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 28837 + }, + { + "epoch": 0.28838, + "grad_norm": 0.5717915710151493, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 28838 + }, + { + "epoch": 0.28839, + "grad_norm": 0.6113261571751487, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 28839 + }, + { + "epoch": 0.2884, + "grad_norm": 0.6704764870226773, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 28840 + }, + { + "epoch": 0.28841, + "grad_norm": 0.7472050478435659, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 28841 + }, + { + "epoch": 0.28842, + "grad_norm": 0.7147134736800776, + "learning_rate": 0.003, + "loss": 4.048, + "step": 28842 + }, + { + "epoch": 0.28843, + "grad_norm": 0.625772791293347, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 28843 + }, + { + "epoch": 0.28844, + "grad_norm": 0.6552903800807355, + "learning_rate": 0.003, + "loss": 4.036, + "step": 28844 + }, + { + "epoch": 0.28845, + "grad_norm": 0.7106876907646548, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 28845 + }, + { + "epoch": 0.28846, + "grad_norm": 0.842710154032099, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 28846 + }, + { + "epoch": 0.28847, + "grad_norm": 0.9798976966727766, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 28847 + }, + { + "epoch": 0.28848, + "grad_norm": 1.1207163970792315, + "learning_rate": 0.003, + "loss": 4.0057, + "step": 28848 + }, + { + "epoch": 0.28849, + "grad_norm": 0.9405878686179455, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 28849 + }, + { + "epoch": 0.2885, + "grad_norm": 0.8478092083046985, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 28850 + }, + { + "epoch": 0.28851, + "grad_norm": 0.8610464189253911, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 28851 + }, + { + "epoch": 0.28852, + "grad_norm": 0.9114224049556959, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 28852 + }, + { + "epoch": 0.28853, + "grad_norm": 0.7872326957994091, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 28853 + }, + { + "epoch": 0.28854, + "grad_norm": 0.6969657206543184, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 28854 + }, + { + "epoch": 0.28855, + "grad_norm": 0.6824396135455695, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 28855 + }, + { + "epoch": 0.28856, + "grad_norm": 0.7226855467866632, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 28856 + }, + { + "epoch": 0.28857, + "grad_norm": 0.7081791347421768, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 28857 + }, + { + "epoch": 0.28858, + "grad_norm": 0.647868338067922, + "learning_rate": 0.003, + "loss": 4.029, + "step": 28858 + }, + { + "epoch": 0.28859, + "grad_norm": 0.5894469718862403, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 28859 + }, + { + "epoch": 0.2886, + "grad_norm": 0.5572159828049006, + "learning_rate": 0.003, + "loss": 4.052, + "step": 28860 + }, + { + "epoch": 0.28861, + "grad_norm": 0.7162841389792863, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 28861 + }, + { + "epoch": 0.28862, + "grad_norm": 0.9450464260186382, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 28862 + }, + { + "epoch": 0.28863, + "grad_norm": 1.1407901747760987, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 28863 + }, + { + "epoch": 0.28864, + "grad_norm": 0.9403691103361915, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 28864 + }, + { + "epoch": 0.28865, + "grad_norm": 0.9196587668469819, + "learning_rate": 0.003, + "loss": 4.03, + "step": 28865 + }, + { + "epoch": 0.28866, + "grad_norm": 0.7787395306860375, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 28866 + }, + { + "epoch": 0.28867, + "grad_norm": 0.8920242004885823, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 28867 + }, + { + "epoch": 0.28868, + "grad_norm": 0.9270328947617662, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 28868 + }, + { + "epoch": 0.28869, + "grad_norm": 0.9506191054783535, + "learning_rate": 0.003, + "loss": 4.038, + "step": 28869 + }, + { + "epoch": 0.2887, + "grad_norm": 1.0320246789051546, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 28870 + }, + { + "epoch": 0.28871, + "grad_norm": 0.9396029047915674, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 28871 + }, + { + "epoch": 0.28872, + "grad_norm": 1.0876543182870668, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 28872 + }, + { + "epoch": 0.28873, + "grad_norm": 1.145933539183094, + "learning_rate": 0.003, + "loss": 4.039, + "step": 28873 + }, + { + "epoch": 0.28874, + "grad_norm": 1.0098045356186232, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 28874 + }, + { + "epoch": 0.28875, + "grad_norm": 0.9141701900183897, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 28875 + }, + { + "epoch": 0.28876, + "grad_norm": 0.9290129530760866, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 28876 + }, + { + "epoch": 0.28877, + "grad_norm": 0.8900875184167762, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 28877 + }, + { + "epoch": 0.28878, + "grad_norm": 0.9064701353647765, + "learning_rate": 0.003, + "loss": 4.044, + "step": 28878 + }, + { + "epoch": 0.28879, + "grad_norm": 0.8710248324815222, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 28879 + }, + { + "epoch": 0.2888, + "grad_norm": 0.8722735598674553, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 28880 + }, + { + "epoch": 0.28881, + "grad_norm": 0.8454826902457475, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 28881 + }, + { + "epoch": 0.28882, + "grad_norm": 0.9500881071323944, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 28882 + }, + { + "epoch": 0.28883, + "grad_norm": 0.9746945926448343, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 28883 + }, + { + "epoch": 0.28884, + "grad_norm": 1.142877424621866, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 28884 + }, + { + "epoch": 0.28885, + "grad_norm": 0.9559126363643763, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 28885 + }, + { + "epoch": 0.28886, + "grad_norm": 0.851044036397433, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 28886 + }, + { + "epoch": 0.28887, + "grad_norm": 0.8085349619339791, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 28887 + }, + { + "epoch": 0.28888, + "grad_norm": 0.8599276643573764, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 28888 + }, + { + "epoch": 0.28889, + "grad_norm": 0.7579893657603481, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 28889 + }, + { + "epoch": 0.2889, + "grad_norm": 0.7261995900233258, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 28890 + }, + { + "epoch": 0.28891, + "grad_norm": 0.8164054520380887, + "learning_rate": 0.003, + "loss": 4.027, + "step": 28891 + }, + { + "epoch": 0.28892, + "grad_norm": 0.9074819684848089, + "learning_rate": 0.003, + "loss": 4.034, + "step": 28892 + }, + { + "epoch": 0.28893, + "grad_norm": 0.9588242370465906, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 28893 + }, + { + "epoch": 0.28894, + "grad_norm": 0.8573618589972752, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 28894 + }, + { + "epoch": 0.28895, + "grad_norm": 0.7146283171071297, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 28895 + }, + { + "epoch": 0.28896, + "grad_norm": 0.6214239937521602, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 28896 + }, + { + "epoch": 0.28897, + "grad_norm": 0.5434195094438631, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 28897 + }, + { + "epoch": 0.28898, + "grad_norm": 0.5603548809203606, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 28898 + }, + { + "epoch": 0.28899, + "grad_norm": 0.6039629738600372, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 28899 + }, + { + "epoch": 0.289, + "grad_norm": 0.7245867621656004, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 28900 + }, + { + "epoch": 0.28901, + "grad_norm": 0.8821930362899142, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 28901 + }, + { + "epoch": 0.28902, + "grad_norm": 1.113326275254241, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 28902 + }, + { + "epoch": 0.28903, + "grad_norm": 0.8840333424071184, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 28903 + }, + { + "epoch": 0.28904, + "grad_norm": 0.9515853769265549, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 28904 + }, + { + "epoch": 0.28905, + "grad_norm": 0.9406130391232622, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 28905 + }, + { + "epoch": 0.28906, + "grad_norm": 0.8085226066854387, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 28906 + }, + { + "epoch": 0.28907, + "grad_norm": 0.8497668464161044, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 28907 + }, + { + "epoch": 0.28908, + "grad_norm": 0.7258700497289223, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 28908 + }, + { + "epoch": 0.28909, + "grad_norm": 0.5796652716136772, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 28909 + }, + { + "epoch": 0.2891, + "grad_norm": 0.5773452600720476, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 28910 + }, + { + "epoch": 0.28911, + "grad_norm": 0.6489792879303007, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 28911 + }, + { + "epoch": 0.28912, + "grad_norm": 0.7451795230437973, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 28912 + }, + { + "epoch": 0.28913, + "grad_norm": 0.8879992894495066, + "learning_rate": 0.003, + "loss": 4.018, + "step": 28913 + }, + { + "epoch": 0.28914, + "grad_norm": 0.9263451341884663, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 28914 + }, + { + "epoch": 0.28915, + "grad_norm": 0.8917579206926248, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 28915 + }, + { + "epoch": 0.28916, + "grad_norm": 0.8607799072119376, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 28916 + }, + { + "epoch": 0.28917, + "grad_norm": 0.8118260330090631, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 28917 + }, + { + "epoch": 0.28918, + "grad_norm": 0.767832759979061, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 28918 + }, + { + "epoch": 0.28919, + "grad_norm": 0.766954785577315, + "learning_rate": 0.003, + "loss": 4.01, + "step": 28919 + }, + { + "epoch": 0.2892, + "grad_norm": 0.9121939162221603, + "learning_rate": 0.003, + "loss": 4.04, + "step": 28920 + }, + { + "epoch": 0.28921, + "grad_norm": 1.1709053490028063, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 28921 + }, + { + "epoch": 0.28922, + "grad_norm": 0.8883039525212179, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 28922 + }, + { + "epoch": 0.28923, + "grad_norm": 0.7980417516190044, + "learning_rate": 0.003, + "loss": 4.014, + "step": 28923 + }, + { + "epoch": 0.28924, + "grad_norm": 0.8725842407269302, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 28924 + }, + { + "epoch": 0.28925, + "grad_norm": 0.941083131137281, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 28925 + }, + { + "epoch": 0.28926, + "grad_norm": 0.8662753961081622, + "learning_rate": 0.003, + "loss": 4.006, + "step": 28926 + }, + { + "epoch": 0.28927, + "grad_norm": 0.8161538545027825, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 28927 + }, + { + "epoch": 0.28928, + "grad_norm": 0.8914212846765761, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 28928 + }, + { + "epoch": 0.28929, + "grad_norm": 1.058540640912842, + "learning_rate": 0.003, + "loss": 4.063, + "step": 28929 + }, + { + "epoch": 0.2893, + "grad_norm": 1.2074767942989324, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 28930 + }, + { + "epoch": 0.28931, + "grad_norm": 0.8109176845733301, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 28931 + }, + { + "epoch": 0.28932, + "grad_norm": 0.7664234781087832, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 28932 + }, + { + "epoch": 0.28933, + "grad_norm": 0.8489470995187731, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 28933 + }, + { + "epoch": 0.28934, + "grad_norm": 0.8087517299385847, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 28934 + }, + { + "epoch": 0.28935, + "grad_norm": 0.8193664487211005, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 28935 + }, + { + "epoch": 0.28936, + "grad_norm": 0.8405605650285187, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 28936 + }, + { + "epoch": 0.28937, + "grad_norm": 0.7667862875405236, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 28937 + }, + { + "epoch": 0.28938, + "grad_norm": 0.7362073526187615, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 28938 + }, + { + "epoch": 0.28939, + "grad_norm": 0.8217980027767611, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 28939 + }, + { + "epoch": 0.2894, + "grad_norm": 0.8919069308650239, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 28940 + }, + { + "epoch": 0.28941, + "grad_norm": 1.0021654430890485, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 28941 + }, + { + "epoch": 0.28942, + "grad_norm": 1.1888096855212755, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 28942 + }, + { + "epoch": 0.28943, + "grad_norm": 0.9588812091279149, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 28943 + }, + { + "epoch": 0.28944, + "grad_norm": 0.96319800565272, + "learning_rate": 0.003, + "loss": 4.041, + "step": 28944 + }, + { + "epoch": 0.28945, + "grad_norm": 0.8708084918092808, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 28945 + }, + { + "epoch": 0.28946, + "grad_norm": 0.8233991791945643, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 28946 + }, + { + "epoch": 0.28947, + "grad_norm": 0.7912094511926252, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 28947 + }, + { + "epoch": 0.28948, + "grad_norm": 0.7349432140474855, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 28948 + }, + { + "epoch": 0.28949, + "grad_norm": 0.7882877328977578, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 28949 + }, + { + "epoch": 0.2895, + "grad_norm": 0.8479933097718767, + "learning_rate": 0.003, + "loss": 4.033, + "step": 28950 + }, + { + "epoch": 0.28951, + "grad_norm": 0.8238371714602934, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 28951 + }, + { + "epoch": 0.28952, + "grad_norm": 0.7833981751328564, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 28952 + }, + { + "epoch": 0.28953, + "grad_norm": 0.7779321286986476, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 28953 + }, + { + "epoch": 0.28954, + "grad_norm": 0.9222481273354509, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 28954 + }, + { + "epoch": 0.28955, + "grad_norm": 1.0553150709426524, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 28955 + }, + { + "epoch": 0.28956, + "grad_norm": 0.9296024386697446, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 28956 + }, + { + "epoch": 0.28957, + "grad_norm": 0.8057004486816919, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 28957 + }, + { + "epoch": 0.28958, + "grad_norm": 0.711702985987495, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 28958 + }, + { + "epoch": 0.28959, + "grad_norm": 0.7211715957752926, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 28959 + }, + { + "epoch": 0.2896, + "grad_norm": 0.819124571215337, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 28960 + }, + { + "epoch": 0.28961, + "grad_norm": 0.7952229814352482, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 28961 + }, + { + "epoch": 0.28962, + "grad_norm": 0.7533518860602568, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 28962 + }, + { + "epoch": 0.28963, + "grad_norm": 0.8246125360983851, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 28963 + }, + { + "epoch": 0.28964, + "grad_norm": 0.8897105814362145, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 28964 + }, + { + "epoch": 0.28965, + "grad_norm": 1.0294537201267124, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 28965 + }, + { + "epoch": 0.28966, + "grad_norm": 1.0502585848701835, + "learning_rate": 0.003, + "loss": 4.058, + "step": 28966 + }, + { + "epoch": 0.28967, + "grad_norm": 0.92284934628861, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 28967 + }, + { + "epoch": 0.28968, + "grad_norm": 0.9363594262704179, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 28968 + }, + { + "epoch": 0.28969, + "grad_norm": 0.8690761993032236, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 28969 + }, + { + "epoch": 0.2897, + "grad_norm": 0.824409475588958, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 28970 + }, + { + "epoch": 0.28971, + "grad_norm": 0.8035164095434366, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 28971 + }, + { + "epoch": 0.28972, + "grad_norm": 0.7589006996790664, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 28972 + }, + { + "epoch": 0.28973, + "grad_norm": 0.7615179657641672, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 28973 + }, + { + "epoch": 0.28974, + "grad_norm": 0.8884397920607304, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 28974 + }, + { + "epoch": 0.28975, + "grad_norm": 1.1910551675567682, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 28975 + }, + { + "epoch": 0.28976, + "grad_norm": 0.8726550101565395, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 28976 + }, + { + "epoch": 0.28977, + "grad_norm": 0.8365635196242915, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 28977 + }, + { + "epoch": 0.28978, + "grad_norm": 0.903801751634053, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 28978 + }, + { + "epoch": 0.28979, + "grad_norm": 0.9267446222623441, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 28979 + }, + { + "epoch": 0.2898, + "grad_norm": 0.9326200584049709, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 28980 + }, + { + "epoch": 0.28981, + "grad_norm": 0.9669692812758987, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 28981 + }, + { + "epoch": 0.28982, + "grad_norm": 0.966563366586481, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 28982 + }, + { + "epoch": 0.28983, + "grad_norm": 0.8688769261203574, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 28983 + }, + { + "epoch": 0.28984, + "grad_norm": 0.7425761664345544, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 28984 + }, + { + "epoch": 0.28985, + "grad_norm": 0.719820241920268, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 28985 + }, + { + "epoch": 0.28986, + "grad_norm": 0.7084364008142771, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 28986 + }, + { + "epoch": 0.28987, + "grad_norm": 0.6363119903990807, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 28987 + }, + { + "epoch": 0.28988, + "grad_norm": 0.6195040204692277, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 28988 + }, + { + "epoch": 0.28989, + "grad_norm": 0.6076246697955769, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 28989 + }, + { + "epoch": 0.2899, + "grad_norm": 0.7139172986381505, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 28990 + }, + { + "epoch": 0.28991, + "grad_norm": 0.9037942607649624, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 28991 + }, + { + "epoch": 0.28992, + "grad_norm": 1.0507100135052365, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 28992 + }, + { + "epoch": 0.28993, + "grad_norm": 0.8792692268518675, + "learning_rate": 0.003, + "loss": 4.0055, + "step": 28993 + }, + { + "epoch": 0.28994, + "grad_norm": 0.8223857944248871, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 28994 + }, + { + "epoch": 0.28995, + "grad_norm": 0.9360105325980802, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 28995 + }, + { + "epoch": 0.28996, + "grad_norm": 0.9247845156632187, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 28996 + }, + { + "epoch": 0.28997, + "grad_norm": 0.9075118864410737, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 28997 + }, + { + "epoch": 0.28998, + "grad_norm": 0.9274920322665163, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 28998 + }, + { + "epoch": 0.28999, + "grad_norm": 0.9301193002309273, + "learning_rate": 0.003, + "loss": 4.062, + "step": 28999 + }, + { + "epoch": 0.29, + "grad_norm": 0.8063186387535685, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 29000 + }, + { + "epoch": 0.29001, + "grad_norm": 0.6998469161690216, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 29001 + }, + { + "epoch": 0.29002, + "grad_norm": 0.6628655320968777, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 29002 + }, + { + "epoch": 0.29003, + "grad_norm": 0.7055446184701589, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 29003 + }, + { + "epoch": 0.29004, + "grad_norm": 0.7977392001401715, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 29004 + }, + { + "epoch": 0.29005, + "grad_norm": 0.9663136620822266, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 29005 + }, + { + "epoch": 0.29006, + "grad_norm": 1.0732239970082185, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 29006 + }, + { + "epoch": 0.29007, + "grad_norm": 0.8839605849640741, + "learning_rate": 0.003, + "loss": 4.0087, + "step": 29007 + }, + { + "epoch": 0.29008, + "grad_norm": 1.0081742265128457, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 29008 + }, + { + "epoch": 0.29009, + "grad_norm": 1.1669634331025756, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 29009 + }, + { + "epoch": 0.2901, + "grad_norm": 0.8496385840039626, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 29010 + }, + { + "epoch": 0.29011, + "grad_norm": 0.7928554066794617, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 29011 + }, + { + "epoch": 0.29012, + "grad_norm": 0.8540558027110825, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 29012 + }, + { + "epoch": 0.29013, + "grad_norm": 0.7824705400205778, + "learning_rate": 0.003, + "loss": 4.07, + "step": 29013 + }, + { + "epoch": 0.29014, + "grad_norm": 0.7508384438289558, + "learning_rate": 0.003, + "loss": 4.066, + "step": 29014 + }, + { + "epoch": 0.29015, + "grad_norm": 0.849971212923941, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 29015 + }, + { + "epoch": 0.29016, + "grad_norm": 0.975196621925759, + "learning_rate": 0.003, + "loss": 4.048, + "step": 29016 + }, + { + "epoch": 0.29017, + "grad_norm": 1.1610772629896862, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 29017 + }, + { + "epoch": 0.29018, + "grad_norm": 0.9337849370046681, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 29018 + }, + { + "epoch": 0.29019, + "grad_norm": 0.7789384036323321, + "learning_rate": 0.003, + "loss": 4.048, + "step": 29019 + }, + { + "epoch": 0.2902, + "grad_norm": 0.7247002327345203, + "learning_rate": 0.003, + "loss": 4.03, + "step": 29020 + }, + { + "epoch": 0.29021, + "grad_norm": 0.8343945083180758, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 29021 + }, + { + "epoch": 0.29022, + "grad_norm": 0.9794451664189109, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 29022 + }, + { + "epoch": 0.29023, + "grad_norm": 1.1109540449905884, + "learning_rate": 0.003, + "loss": 4.04, + "step": 29023 + }, + { + "epoch": 0.29024, + "grad_norm": 0.9064800008464814, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 29024 + }, + { + "epoch": 0.29025, + "grad_norm": 0.8810147389983761, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 29025 + }, + { + "epoch": 0.29026, + "grad_norm": 1.0028645438064123, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 29026 + }, + { + "epoch": 0.29027, + "grad_norm": 1.0417299244975535, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 29027 + }, + { + "epoch": 0.29028, + "grad_norm": 0.9347711329717023, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 29028 + }, + { + "epoch": 0.29029, + "grad_norm": 0.855309585154985, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 29029 + }, + { + "epoch": 0.2903, + "grad_norm": 0.88988367402162, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 29030 + }, + { + "epoch": 0.29031, + "grad_norm": 1.0074415556997198, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 29031 + }, + { + "epoch": 0.29032, + "grad_norm": 1.0013748834042207, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 29032 + }, + { + "epoch": 0.29033, + "grad_norm": 1.0032936374874766, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 29033 + }, + { + "epoch": 0.29034, + "grad_norm": 1.0440594839172814, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 29034 + }, + { + "epoch": 0.29035, + "grad_norm": 0.9621889667334785, + "learning_rate": 0.003, + "loss": 4.055, + "step": 29035 + }, + { + "epoch": 0.29036, + "grad_norm": 0.8260609841561933, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 29036 + }, + { + "epoch": 0.29037, + "grad_norm": 0.6968933938847688, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 29037 + }, + { + "epoch": 0.29038, + "grad_norm": 0.82885146563968, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 29038 + }, + { + "epoch": 0.29039, + "grad_norm": 0.9718326696076595, + "learning_rate": 0.003, + "loss": 4.087, + "step": 29039 + }, + { + "epoch": 0.2904, + "grad_norm": 0.9866022752244584, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 29040 + }, + { + "epoch": 0.29041, + "grad_norm": 0.958379521653482, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 29041 + }, + { + "epoch": 0.29042, + "grad_norm": 0.7910850266352244, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 29042 + }, + { + "epoch": 0.29043, + "grad_norm": 0.9021784771549576, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 29043 + }, + { + "epoch": 0.29044, + "grad_norm": 1.0413643594825606, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 29044 + }, + { + "epoch": 0.29045, + "grad_norm": 1.0311816862456207, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 29045 + }, + { + "epoch": 0.29046, + "grad_norm": 1.0162135162183141, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 29046 + }, + { + "epoch": 0.29047, + "grad_norm": 1.030962499576916, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 29047 + }, + { + "epoch": 0.29048, + "grad_norm": 1.1003512882897302, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 29048 + }, + { + "epoch": 0.29049, + "grad_norm": 0.7823182390929905, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 29049 + }, + { + "epoch": 0.2905, + "grad_norm": 0.6700577133750076, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 29050 + }, + { + "epoch": 0.29051, + "grad_norm": 0.5724679014614362, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 29051 + }, + { + "epoch": 0.29052, + "grad_norm": 0.5446329882275273, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 29052 + }, + { + "epoch": 0.29053, + "grad_norm": 0.5767621905010737, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 29053 + }, + { + "epoch": 0.29054, + "grad_norm": 0.6393633539014837, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 29054 + }, + { + "epoch": 0.29055, + "grad_norm": 0.7523622361464737, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 29055 + }, + { + "epoch": 0.29056, + "grad_norm": 0.8856001813867815, + "learning_rate": 0.003, + "loss": 4.039, + "step": 29056 + }, + { + "epoch": 0.29057, + "grad_norm": 1.0676416978946066, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 29057 + }, + { + "epoch": 0.29058, + "grad_norm": 0.882066029945932, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 29058 + }, + { + "epoch": 0.29059, + "grad_norm": 0.641676822051498, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 29059 + }, + { + "epoch": 0.2906, + "grad_norm": 0.5952505024829797, + "learning_rate": 0.003, + "loss": 4.015, + "step": 29060 + }, + { + "epoch": 0.29061, + "grad_norm": 0.7347464262698876, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 29061 + }, + { + "epoch": 0.29062, + "grad_norm": 0.8049741919398687, + "learning_rate": 0.003, + "loss": 4.0066, + "step": 29062 + }, + { + "epoch": 0.29063, + "grad_norm": 0.7292550614254075, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 29063 + }, + { + "epoch": 0.29064, + "grad_norm": 0.64751892168709, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 29064 + }, + { + "epoch": 0.29065, + "grad_norm": 0.6988684266375307, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 29065 + }, + { + "epoch": 0.29066, + "grad_norm": 0.7879739710066137, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 29066 + }, + { + "epoch": 0.29067, + "grad_norm": 0.7912355083608571, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 29067 + }, + { + "epoch": 0.29068, + "grad_norm": 0.7466234972113204, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 29068 + }, + { + "epoch": 0.29069, + "grad_norm": 0.7467073765721259, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 29069 + }, + { + "epoch": 0.2907, + "grad_norm": 0.7522997760474645, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 29070 + }, + { + "epoch": 0.29071, + "grad_norm": 0.7945991428872144, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 29071 + }, + { + "epoch": 0.29072, + "grad_norm": 0.9778435310244655, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 29072 + }, + { + "epoch": 0.29073, + "grad_norm": 1.2424869038419357, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 29073 + }, + { + "epoch": 0.29074, + "grad_norm": 0.6971897608627028, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 29074 + }, + { + "epoch": 0.29075, + "grad_norm": 0.6244573819422945, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 29075 + }, + { + "epoch": 0.29076, + "grad_norm": 0.5621536073217575, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 29076 + }, + { + "epoch": 0.29077, + "grad_norm": 0.5955133103667601, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 29077 + }, + { + "epoch": 0.29078, + "grad_norm": 0.636876995375685, + "learning_rate": 0.003, + "loss": 4.01, + "step": 29078 + }, + { + "epoch": 0.29079, + "grad_norm": 0.7145822512966056, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 29079 + }, + { + "epoch": 0.2908, + "grad_norm": 0.8587816043339325, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 29080 + }, + { + "epoch": 0.29081, + "grad_norm": 0.9886681826339178, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 29081 + }, + { + "epoch": 0.29082, + "grad_norm": 1.1663563796101664, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 29082 + }, + { + "epoch": 0.29083, + "grad_norm": 1.0939174611670122, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 29083 + }, + { + "epoch": 0.29084, + "grad_norm": 0.9684962154354666, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 29084 + }, + { + "epoch": 0.29085, + "grad_norm": 0.9636114080243234, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 29085 + }, + { + "epoch": 0.29086, + "grad_norm": 1.0003433449468262, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 29086 + }, + { + "epoch": 0.29087, + "grad_norm": 0.9987590940573388, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 29087 + }, + { + "epoch": 0.29088, + "grad_norm": 0.9677320768125557, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 29088 + }, + { + "epoch": 0.29089, + "grad_norm": 0.9013250872209218, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 29089 + }, + { + "epoch": 0.2909, + "grad_norm": 0.8541432778093003, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 29090 + }, + { + "epoch": 0.29091, + "grad_norm": 0.7774036175780287, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 29091 + }, + { + "epoch": 0.29092, + "grad_norm": 0.7504335544785947, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 29092 + }, + { + "epoch": 0.29093, + "grad_norm": 0.7248556266488336, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 29093 + }, + { + "epoch": 0.29094, + "grad_norm": 0.7611453350552821, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 29094 + }, + { + "epoch": 0.29095, + "grad_norm": 0.9036233775356234, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 29095 + }, + { + "epoch": 0.29096, + "grad_norm": 0.9608494345301011, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 29096 + }, + { + "epoch": 0.29097, + "grad_norm": 1.0865538487596067, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 29097 + }, + { + "epoch": 0.29098, + "grad_norm": 1.0088182214699868, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 29098 + }, + { + "epoch": 0.29099, + "grad_norm": 0.8686632910774595, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 29099 + }, + { + "epoch": 0.291, + "grad_norm": 0.7632585279792381, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 29100 + }, + { + "epoch": 0.29101, + "grad_norm": 0.9137215478277559, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 29101 + }, + { + "epoch": 0.29102, + "grad_norm": 1.0876449179019083, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 29102 + }, + { + "epoch": 0.29103, + "grad_norm": 1.091843808926389, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 29103 + }, + { + "epoch": 0.29104, + "grad_norm": 1.1068651830643574, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 29104 + }, + { + "epoch": 0.29105, + "grad_norm": 1.0828130287017224, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 29105 + }, + { + "epoch": 0.29106, + "grad_norm": 0.8604116004067545, + "learning_rate": 0.003, + "loss": 4.055, + "step": 29106 + }, + { + "epoch": 0.29107, + "grad_norm": 0.7828077053221636, + "learning_rate": 0.003, + "loss": 4.02, + "step": 29107 + }, + { + "epoch": 0.29108, + "grad_norm": 0.7767082333441759, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 29108 + }, + { + "epoch": 0.29109, + "grad_norm": 0.7946975472712052, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 29109 + }, + { + "epoch": 0.2911, + "grad_norm": 0.7326015559378439, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 29110 + }, + { + "epoch": 0.29111, + "grad_norm": 0.6481010880823052, + "learning_rate": 0.003, + "loss": 4.052, + "step": 29111 + }, + { + "epoch": 0.29112, + "grad_norm": 0.5679321995182853, + "learning_rate": 0.003, + "loss": 4.053, + "step": 29112 + }, + { + "epoch": 0.29113, + "grad_norm": 0.6982882691454518, + "learning_rate": 0.003, + "loss": 4.0018, + "step": 29113 + }, + { + "epoch": 0.29114, + "grad_norm": 0.8768824511204043, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 29114 + }, + { + "epoch": 0.29115, + "grad_norm": 1.009020915345648, + "learning_rate": 0.003, + "loss": 4.04, + "step": 29115 + }, + { + "epoch": 0.29116, + "grad_norm": 1.1124340972151603, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 29116 + }, + { + "epoch": 0.29117, + "grad_norm": 0.8486503157805139, + "learning_rate": 0.003, + "loss": 4.061, + "step": 29117 + }, + { + "epoch": 0.29118, + "grad_norm": 0.7301274138635538, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 29118 + }, + { + "epoch": 0.29119, + "grad_norm": 0.677504566770362, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 29119 + }, + { + "epoch": 0.2912, + "grad_norm": 0.7040692663735684, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 29120 + }, + { + "epoch": 0.29121, + "grad_norm": 0.7662274462026148, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 29121 + }, + { + "epoch": 0.29122, + "grad_norm": 0.8316691644135649, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 29122 + }, + { + "epoch": 0.29123, + "grad_norm": 0.8281910237430045, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 29123 + }, + { + "epoch": 0.29124, + "grad_norm": 0.7926456141834706, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 29124 + }, + { + "epoch": 0.29125, + "grad_norm": 0.7986394671081941, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 29125 + }, + { + "epoch": 0.29126, + "grad_norm": 0.8069511561975887, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 29126 + }, + { + "epoch": 0.29127, + "grad_norm": 0.8303218114562186, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 29127 + }, + { + "epoch": 0.29128, + "grad_norm": 0.8826488097190619, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 29128 + }, + { + "epoch": 0.29129, + "grad_norm": 0.7946248949147684, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 29129 + }, + { + "epoch": 0.2913, + "grad_norm": 0.7271279939730684, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 29130 + }, + { + "epoch": 0.29131, + "grad_norm": 0.7644075406714997, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 29131 + }, + { + "epoch": 0.29132, + "grad_norm": 0.8223171611208211, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 29132 + }, + { + "epoch": 0.29133, + "grad_norm": 0.9724050191743714, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 29133 + }, + { + "epoch": 0.29134, + "grad_norm": 1.0140588538320097, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 29134 + }, + { + "epoch": 0.29135, + "grad_norm": 0.8474992815444606, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 29135 + }, + { + "epoch": 0.29136, + "grad_norm": 0.7710624274013632, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 29136 + }, + { + "epoch": 0.29137, + "grad_norm": 0.8816974042118865, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 29137 + }, + { + "epoch": 0.29138, + "grad_norm": 0.9964430355257407, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 29138 + }, + { + "epoch": 0.29139, + "grad_norm": 0.8346197104328477, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 29139 + }, + { + "epoch": 0.2914, + "grad_norm": 0.8363763717419509, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 29140 + }, + { + "epoch": 0.29141, + "grad_norm": 0.9938523638387123, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 29141 + }, + { + "epoch": 0.29142, + "grad_norm": 1.031964046885035, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 29142 + }, + { + "epoch": 0.29143, + "grad_norm": 1.0924702721225699, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 29143 + }, + { + "epoch": 0.29144, + "grad_norm": 1.082177194103632, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 29144 + }, + { + "epoch": 0.29145, + "grad_norm": 1.137026832348986, + "learning_rate": 0.003, + "loss": 4.086, + "step": 29145 + }, + { + "epoch": 0.29146, + "grad_norm": 0.9801174501113451, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 29146 + }, + { + "epoch": 0.29147, + "grad_norm": 1.105314278321518, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 29147 + }, + { + "epoch": 0.29148, + "grad_norm": 1.029273748969058, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 29148 + }, + { + "epoch": 0.29149, + "grad_norm": 1.0235732170606477, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 29149 + }, + { + "epoch": 0.2915, + "grad_norm": 0.8434851133467528, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 29150 + }, + { + "epoch": 0.29151, + "grad_norm": 0.7950055688953716, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 29151 + }, + { + "epoch": 0.29152, + "grad_norm": 0.8142024325311407, + "learning_rate": 0.003, + "loss": 4.06, + "step": 29152 + }, + { + "epoch": 0.29153, + "grad_norm": 0.8334174790882151, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 29153 + }, + { + "epoch": 0.29154, + "grad_norm": 0.9762462029590928, + "learning_rate": 0.003, + "loss": 4.062, + "step": 29154 + }, + { + "epoch": 0.29155, + "grad_norm": 0.9922185794266971, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 29155 + }, + { + "epoch": 0.29156, + "grad_norm": 0.8895402332661667, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 29156 + }, + { + "epoch": 0.29157, + "grad_norm": 0.7973012672503906, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 29157 + }, + { + "epoch": 0.29158, + "grad_norm": 0.7274269102993839, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 29158 + }, + { + "epoch": 0.29159, + "grad_norm": 0.6531038225079099, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 29159 + }, + { + "epoch": 0.2916, + "grad_norm": 0.6493453009214776, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 29160 + }, + { + "epoch": 0.29161, + "grad_norm": 0.7864708137416616, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 29161 + }, + { + "epoch": 0.29162, + "grad_norm": 0.8975987020304729, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 29162 + }, + { + "epoch": 0.29163, + "grad_norm": 1.0543012784670018, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 29163 + }, + { + "epoch": 0.29164, + "grad_norm": 0.8735621383671371, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 29164 + }, + { + "epoch": 0.29165, + "grad_norm": 0.7138285495075147, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 29165 + }, + { + "epoch": 0.29166, + "grad_norm": 0.6870998365700985, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 29166 + }, + { + "epoch": 0.29167, + "grad_norm": 0.7753553410597777, + "learning_rate": 0.003, + "loss": 4.031, + "step": 29167 + }, + { + "epoch": 0.29168, + "grad_norm": 0.9323123392932248, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 29168 + }, + { + "epoch": 0.29169, + "grad_norm": 0.9055132492437714, + "learning_rate": 0.003, + "loss": 4.042, + "step": 29169 + }, + { + "epoch": 0.2917, + "grad_norm": 0.8682574314297339, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 29170 + }, + { + "epoch": 0.29171, + "grad_norm": 0.8597638417454839, + "learning_rate": 0.003, + "loss": 4.05, + "step": 29171 + }, + { + "epoch": 0.29172, + "grad_norm": 0.8230271795583927, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 29172 + }, + { + "epoch": 0.29173, + "grad_norm": 0.7930967870319119, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 29173 + }, + { + "epoch": 0.29174, + "grad_norm": 0.831662052335709, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 29174 + }, + { + "epoch": 0.29175, + "grad_norm": 1.0465677821426422, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 29175 + }, + { + "epoch": 0.29176, + "grad_norm": 1.1632088759592274, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 29176 + }, + { + "epoch": 0.29177, + "grad_norm": 0.8445603997645509, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 29177 + }, + { + "epoch": 0.29178, + "grad_norm": 0.8052434933886632, + "learning_rate": 0.003, + "loss": 4.036, + "step": 29178 + }, + { + "epoch": 0.29179, + "grad_norm": 0.7763277938392388, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 29179 + }, + { + "epoch": 0.2918, + "grad_norm": 0.8580652102447227, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 29180 + }, + { + "epoch": 0.29181, + "grad_norm": 0.9614079622945705, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 29181 + }, + { + "epoch": 0.29182, + "grad_norm": 0.9743996187795096, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 29182 + }, + { + "epoch": 0.29183, + "grad_norm": 1.0054756520846415, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 29183 + }, + { + "epoch": 0.29184, + "grad_norm": 0.932837054364316, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 29184 + }, + { + "epoch": 0.29185, + "grad_norm": 0.788973956754135, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 29185 + }, + { + "epoch": 0.29186, + "grad_norm": 0.6747679546183112, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 29186 + }, + { + "epoch": 0.29187, + "grad_norm": 0.6987433246156082, + "learning_rate": 0.003, + "loss": 4.029, + "step": 29187 + }, + { + "epoch": 0.29188, + "grad_norm": 0.657936457458693, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 29188 + }, + { + "epoch": 0.29189, + "grad_norm": 0.575536929539275, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 29189 + }, + { + "epoch": 0.2919, + "grad_norm": 0.7164055895233915, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 29190 + }, + { + "epoch": 0.29191, + "grad_norm": 0.8941036088036134, + "learning_rate": 0.003, + "loss": 4.03, + "step": 29191 + }, + { + "epoch": 0.29192, + "grad_norm": 0.8920879001504314, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 29192 + }, + { + "epoch": 0.29193, + "grad_norm": 0.8214872567383538, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 29193 + }, + { + "epoch": 0.29194, + "grad_norm": 0.9175091876800733, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 29194 + }, + { + "epoch": 0.29195, + "grad_norm": 0.9872106304650676, + "learning_rate": 0.003, + "loss": 4.016, + "step": 29195 + }, + { + "epoch": 0.29196, + "grad_norm": 1.1176477508342733, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 29196 + }, + { + "epoch": 0.29197, + "grad_norm": 0.9619487907506075, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 29197 + }, + { + "epoch": 0.29198, + "grad_norm": 0.9776565126326301, + "learning_rate": 0.003, + "loss": 4.0047, + "step": 29198 + }, + { + "epoch": 0.29199, + "grad_norm": 0.9949532674466596, + "learning_rate": 0.003, + "loss": 4.081, + "step": 29199 + }, + { + "epoch": 0.292, + "grad_norm": 1.0320563204471962, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 29200 + }, + { + "epoch": 0.29201, + "grad_norm": 0.9345325605721327, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 29201 + }, + { + "epoch": 0.29202, + "grad_norm": 1.0742316754594423, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 29202 + }, + { + "epoch": 0.29203, + "grad_norm": 1.0027997059284104, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 29203 + }, + { + "epoch": 0.29204, + "grad_norm": 0.928447195293581, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 29204 + }, + { + "epoch": 0.29205, + "grad_norm": 0.8456000591072352, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 29205 + }, + { + "epoch": 0.29206, + "grad_norm": 0.9844721961679435, + "learning_rate": 0.003, + "loss": 4.051, + "step": 29206 + }, + { + "epoch": 0.29207, + "grad_norm": 1.1335750333174595, + "learning_rate": 0.003, + "loss": 4.041, + "step": 29207 + }, + { + "epoch": 0.29208, + "grad_norm": 0.9075446818762009, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 29208 + }, + { + "epoch": 0.29209, + "grad_norm": 0.9809588662354749, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 29209 + }, + { + "epoch": 0.2921, + "grad_norm": 1.029200134176531, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 29210 + }, + { + "epoch": 0.29211, + "grad_norm": 0.868850439967874, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 29211 + }, + { + "epoch": 0.29212, + "grad_norm": 0.873708167758466, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 29212 + }, + { + "epoch": 0.29213, + "grad_norm": 0.8158447183498174, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 29213 + }, + { + "epoch": 0.29214, + "grad_norm": 0.6507164469894889, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 29214 + }, + { + "epoch": 0.29215, + "grad_norm": 0.6861763273810364, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 29215 + }, + { + "epoch": 0.29216, + "grad_norm": 0.7143574097076757, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 29216 + }, + { + "epoch": 0.29217, + "grad_norm": 0.7327039244228825, + "learning_rate": 0.003, + "loss": 4.048, + "step": 29217 + }, + { + "epoch": 0.29218, + "grad_norm": 0.6485960094681973, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 29218 + }, + { + "epoch": 0.29219, + "grad_norm": 0.6981930432351805, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 29219 + }, + { + "epoch": 0.2922, + "grad_norm": 0.6992820644232114, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 29220 + }, + { + "epoch": 0.29221, + "grad_norm": 0.6962057632360914, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 29221 + }, + { + "epoch": 0.29222, + "grad_norm": 0.6977666355584855, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 29222 + }, + { + "epoch": 0.29223, + "grad_norm": 0.7465474077780243, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 29223 + }, + { + "epoch": 0.29224, + "grad_norm": 0.9517453483707403, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 29224 + }, + { + "epoch": 0.29225, + "grad_norm": 1.1218142744214015, + "learning_rate": 0.003, + "loss": 4.009, + "step": 29225 + }, + { + "epoch": 0.29226, + "grad_norm": 0.9471998580493735, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 29226 + }, + { + "epoch": 0.29227, + "grad_norm": 0.853073856561557, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 29227 + }, + { + "epoch": 0.29228, + "grad_norm": 0.7099725582606511, + "learning_rate": 0.003, + "loss": 4.032, + "step": 29228 + }, + { + "epoch": 0.29229, + "grad_norm": 0.5629308853205258, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 29229 + }, + { + "epoch": 0.2923, + "grad_norm": 0.6419144848869232, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 29230 + }, + { + "epoch": 0.29231, + "grad_norm": 0.7551077407406445, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 29231 + }, + { + "epoch": 0.29232, + "grad_norm": 0.7626643978592605, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 29232 + }, + { + "epoch": 0.29233, + "grad_norm": 0.8590399794655413, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 29233 + }, + { + "epoch": 0.29234, + "grad_norm": 0.9664789537192814, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 29234 + }, + { + "epoch": 0.29235, + "grad_norm": 0.9508569048809632, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 29235 + }, + { + "epoch": 0.29236, + "grad_norm": 0.8572733667249588, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 29236 + }, + { + "epoch": 0.29237, + "grad_norm": 0.8374413998244363, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 29237 + }, + { + "epoch": 0.29238, + "grad_norm": 0.8735640718375238, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 29238 + }, + { + "epoch": 0.29239, + "grad_norm": 0.841671237794719, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 29239 + }, + { + "epoch": 0.2924, + "grad_norm": 0.7575902159207949, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 29240 + }, + { + "epoch": 0.29241, + "grad_norm": 0.7566043571661052, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 29241 + }, + { + "epoch": 0.29242, + "grad_norm": 0.79825792028567, + "learning_rate": 0.003, + "loss": 4.017, + "step": 29242 + }, + { + "epoch": 0.29243, + "grad_norm": 0.8159087056042108, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 29243 + }, + { + "epoch": 0.29244, + "grad_norm": 0.9249957025315871, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 29244 + }, + { + "epoch": 0.29245, + "grad_norm": 0.9701232118893561, + "learning_rate": 0.003, + "loss": 4.06, + "step": 29245 + }, + { + "epoch": 0.29246, + "grad_norm": 0.8872888400042878, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 29246 + }, + { + "epoch": 0.29247, + "grad_norm": 0.8944752668375463, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 29247 + }, + { + "epoch": 0.29248, + "grad_norm": 0.9139806627538845, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 29248 + }, + { + "epoch": 0.29249, + "grad_norm": 0.9963871026578371, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 29249 + }, + { + "epoch": 0.2925, + "grad_norm": 1.0543277604006016, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 29250 + }, + { + "epoch": 0.29251, + "grad_norm": 0.8844306037002795, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 29251 + }, + { + "epoch": 0.29252, + "grad_norm": 0.9673732563168388, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 29252 + }, + { + "epoch": 0.29253, + "grad_norm": 1.1085691494239849, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 29253 + }, + { + "epoch": 0.29254, + "grad_norm": 0.956399186563831, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 29254 + }, + { + "epoch": 0.29255, + "grad_norm": 0.981087886241805, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 29255 + }, + { + "epoch": 0.29256, + "grad_norm": 0.994106705551306, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 29256 + }, + { + "epoch": 0.29257, + "grad_norm": 0.9929782391556476, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 29257 + }, + { + "epoch": 0.29258, + "grad_norm": 1.0645426371806397, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 29258 + }, + { + "epoch": 0.29259, + "grad_norm": 1.049779212907228, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 29259 + }, + { + "epoch": 0.2926, + "grad_norm": 0.9297195795311273, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 29260 + }, + { + "epoch": 0.29261, + "grad_norm": 0.7790024594330531, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 29261 + }, + { + "epoch": 0.29262, + "grad_norm": 0.7555906336654221, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 29262 + }, + { + "epoch": 0.29263, + "grad_norm": 0.7760064421155114, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 29263 + }, + { + "epoch": 0.29264, + "grad_norm": 0.8353561432244543, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 29264 + }, + { + "epoch": 0.29265, + "grad_norm": 0.8007325510098213, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 29265 + }, + { + "epoch": 0.29266, + "grad_norm": 0.8022486526402812, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 29266 + }, + { + "epoch": 0.29267, + "grad_norm": 0.8601438121014888, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 29267 + }, + { + "epoch": 0.29268, + "grad_norm": 1.0729872349764427, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 29268 + }, + { + "epoch": 0.29269, + "grad_norm": 0.9725287227646507, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 29269 + }, + { + "epoch": 0.2927, + "grad_norm": 0.9554655999488532, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 29270 + }, + { + "epoch": 0.29271, + "grad_norm": 0.9392125538291688, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 29271 + }, + { + "epoch": 0.29272, + "grad_norm": 0.9184674691401653, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 29272 + }, + { + "epoch": 0.29273, + "grad_norm": 0.8498709279276783, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 29273 + }, + { + "epoch": 0.29274, + "grad_norm": 0.7657970703606856, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 29274 + }, + { + "epoch": 0.29275, + "grad_norm": 0.7396758246519036, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 29275 + }, + { + "epoch": 0.29276, + "grad_norm": 0.6328088429878861, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 29276 + }, + { + "epoch": 0.29277, + "grad_norm": 0.6582458836254869, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 29277 + }, + { + "epoch": 0.29278, + "grad_norm": 0.6649561771400734, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 29278 + }, + { + "epoch": 0.29279, + "grad_norm": 0.6536574785301233, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 29279 + }, + { + "epoch": 0.2928, + "grad_norm": 0.7517055026256518, + "learning_rate": 0.003, + "loss": 4.04, + "step": 29280 + }, + { + "epoch": 0.29281, + "grad_norm": 0.8912560338568013, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 29281 + }, + { + "epoch": 0.29282, + "grad_norm": 0.9959478073835422, + "learning_rate": 0.003, + "loss": 4.022, + "step": 29282 + }, + { + "epoch": 0.29283, + "grad_norm": 0.9046294925219313, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 29283 + }, + { + "epoch": 0.29284, + "grad_norm": 0.7797624875132041, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 29284 + }, + { + "epoch": 0.29285, + "grad_norm": 0.8077708132175788, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 29285 + }, + { + "epoch": 0.29286, + "grad_norm": 0.8695378672606314, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 29286 + }, + { + "epoch": 0.29287, + "grad_norm": 0.8987457946756661, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 29287 + }, + { + "epoch": 0.29288, + "grad_norm": 0.7780386503779303, + "learning_rate": 0.003, + "loss": 4.027, + "step": 29288 + }, + { + "epoch": 0.29289, + "grad_norm": 0.8709870923387596, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 29289 + }, + { + "epoch": 0.2929, + "grad_norm": 1.0456277059453967, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 29290 + }, + { + "epoch": 0.29291, + "grad_norm": 1.1723855708269368, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 29291 + }, + { + "epoch": 0.29292, + "grad_norm": 0.8104458824260034, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 29292 + }, + { + "epoch": 0.29293, + "grad_norm": 0.7300959197410518, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 29293 + }, + { + "epoch": 0.29294, + "grad_norm": 0.8325433941364638, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 29294 + }, + { + "epoch": 0.29295, + "grad_norm": 0.7752263757912774, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 29295 + }, + { + "epoch": 0.29296, + "grad_norm": 0.7330061442429696, + "learning_rate": 0.003, + "loss": 4.036, + "step": 29296 + }, + { + "epoch": 0.29297, + "grad_norm": 0.7606785513405115, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 29297 + }, + { + "epoch": 0.29298, + "grad_norm": 0.7752443979213537, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 29298 + }, + { + "epoch": 0.29299, + "grad_norm": 0.7163303971309438, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 29299 + }, + { + "epoch": 0.293, + "grad_norm": 0.7078724538567072, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 29300 + }, + { + "epoch": 0.29301, + "grad_norm": 0.7400479243784089, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 29301 + }, + { + "epoch": 0.29302, + "grad_norm": 0.614657902074472, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 29302 + }, + { + "epoch": 0.29303, + "grad_norm": 0.592817221258346, + "learning_rate": 0.003, + "loss": 4.0023, + "step": 29303 + }, + { + "epoch": 0.29304, + "grad_norm": 0.586983309838342, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 29304 + }, + { + "epoch": 0.29305, + "grad_norm": 0.672445884429573, + "learning_rate": 0.003, + "loss": 4.0067, + "step": 29305 + }, + { + "epoch": 0.29306, + "grad_norm": 0.8363999158206532, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 29306 + }, + { + "epoch": 0.29307, + "grad_norm": 0.9472930103111694, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 29307 + }, + { + "epoch": 0.29308, + "grad_norm": 1.1250665591571334, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 29308 + }, + { + "epoch": 0.29309, + "grad_norm": 0.891751435054052, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 29309 + }, + { + "epoch": 0.2931, + "grad_norm": 0.8581268495889038, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 29310 + }, + { + "epoch": 0.29311, + "grad_norm": 0.8691667252222027, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 29311 + }, + { + "epoch": 0.29312, + "grad_norm": 0.9536017613377106, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 29312 + }, + { + "epoch": 0.29313, + "grad_norm": 1.0694797475805728, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 29313 + }, + { + "epoch": 0.29314, + "grad_norm": 1.0938337418640525, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 29314 + }, + { + "epoch": 0.29315, + "grad_norm": 0.8457552211769384, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 29315 + }, + { + "epoch": 0.29316, + "grad_norm": 0.6826360826222817, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 29316 + }, + { + "epoch": 0.29317, + "grad_norm": 0.6417864421668424, + "learning_rate": 0.003, + "loss": 4.0029, + "step": 29317 + }, + { + "epoch": 0.29318, + "grad_norm": 0.6124779443548072, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 29318 + }, + { + "epoch": 0.29319, + "grad_norm": 0.6206130616811728, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 29319 + }, + { + "epoch": 0.2932, + "grad_norm": 0.7004393581641024, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 29320 + }, + { + "epoch": 0.29321, + "grad_norm": 0.8542966359168971, + "learning_rate": 0.003, + "loss": 4.031, + "step": 29321 + }, + { + "epoch": 0.29322, + "grad_norm": 0.893787775535247, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 29322 + }, + { + "epoch": 0.29323, + "grad_norm": 1.036254246874217, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 29323 + }, + { + "epoch": 0.29324, + "grad_norm": 1.1285848478345955, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 29324 + }, + { + "epoch": 0.29325, + "grad_norm": 1.0624341873589678, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 29325 + }, + { + "epoch": 0.29326, + "grad_norm": 1.1898100802304177, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 29326 + }, + { + "epoch": 0.29327, + "grad_norm": 0.9155271668904038, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 29327 + }, + { + "epoch": 0.29328, + "grad_norm": 0.7832101885107661, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 29328 + }, + { + "epoch": 0.29329, + "grad_norm": 0.8357660496198622, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 29329 + }, + { + "epoch": 0.2933, + "grad_norm": 0.8180783038957523, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 29330 + }, + { + "epoch": 0.29331, + "grad_norm": 0.766684286497782, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 29331 + }, + { + "epoch": 0.29332, + "grad_norm": 0.7822793183487627, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 29332 + }, + { + "epoch": 0.29333, + "grad_norm": 0.9624853512287003, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 29333 + }, + { + "epoch": 0.29334, + "grad_norm": 1.1986404088821028, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 29334 + }, + { + "epoch": 0.29335, + "grad_norm": 1.0079024707435384, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 29335 + }, + { + "epoch": 0.29336, + "grad_norm": 0.9257803558030068, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 29336 + }, + { + "epoch": 0.29337, + "grad_norm": 0.8121587682686504, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 29337 + }, + { + "epoch": 0.29338, + "grad_norm": 0.6169157464172522, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 29338 + }, + { + "epoch": 0.29339, + "grad_norm": 0.6479845919171563, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 29339 + }, + { + "epoch": 0.2934, + "grad_norm": 0.6429919865765577, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 29340 + }, + { + "epoch": 0.29341, + "grad_norm": 0.7489589063393629, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 29341 + }, + { + "epoch": 0.29342, + "grad_norm": 0.7818093763508773, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 29342 + }, + { + "epoch": 0.29343, + "grad_norm": 0.8723346775893264, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 29343 + }, + { + "epoch": 0.29344, + "grad_norm": 1.0294862172880013, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 29344 + }, + { + "epoch": 0.29345, + "grad_norm": 0.9509432984652021, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 29345 + }, + { + "epoch": 0.29346, + "grad_norm": 1.0070727284803271, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 29346 + }, + { + "epoch": 0.29347, + "grad_norm": 1.0465082248263042, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 29347 + }, + { + "epoch": 0.29348, + "grad_norm": 0.9302677982521874, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 29348 + }, + { + "epoch": 0.29349, + "grad_norm": 0.7910710682233116, + "learning_rate": 0.003, + "loss": 4.056, + "step": 29349 + }, + { + "epoch": 0.2935, + "grad_norm": 0.9211929392420392, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 29350 + }, + { + "epoch": 0.29351, + "grad_norm": 1.136217527111664, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 29351 + }, + { + "epoch": 0.29352, + "grad_norm": 0.9763175957886535, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 29352 + }, + { + "epoch": 0.29353, + "grad_norm": 1.0550535061560031, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 29353 + }, + { + "epoch": 0.29354, + "grad_norm": 1.1804677918962205, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 29354 + }, + { + "epoch": 0.29355, + "grad_norm": 0.8818082137386197, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 29355 + }, + { + "epoch": 0.29356, + "grad_norm": 0.8432497120911839, + "learning_rate": 0.003, + "loss": 4.067, + "step": 29356 + }, + { + "epoch": 0.29357, + "grad_norm": 0.8183324006555233, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 29357 + }, + { + "epoch": 0.29358, + "grad_norm": 0.8253824736379026, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 29358 + }, + { + "epoch": 0.29359, + "grad_norm": 0.7472111524234776, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 29359 + }, + { + "epoch": 0.2936, + "grad_norm": 0.6769350553960137, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 29360 + }, + { + "epoch": 0.29361, + "grad_norm": 0.8636802864298198, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 29361 + }, + { + "epoch": 0.29362, + "grad_norm": 0.8750581057134919, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 29362 + }, + { + "epoch": 0.29363, + "grad_norm": 0.700701774055599, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 29363 + }, + { + "epoch": 0.29364, + "grad_norm": 0.6499345477004443, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 29364 + }, + { + "epoch": 0.29365, + "grad_norm": 0.7373235784125733, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 29365 + }, + { + "epoch": 0.29366, + "grad_norm": 0.9383187433769475, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 29366 + }, + { + "epoch": 0.29367, + "grad_norm": 1.2180201920740534, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 29367 + }, + { + "epoch": 0.29368, + "grad_norm": 0.8603715946271335, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 29368 + }, + { + "epoch": 0.29369, + "grad_norm": 0.7783481370551975, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 29369 + }, + { + "epoch": 0.2937, + "grad_norm": 0.7053504760861421, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 29370 + }, + { + "epoch": 0.29371, + "grad_norm": 0.6381069701125877, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 29371 + }, + { + "epoch": 0.29372, + "grad_norm": 0.5681433003024388, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 29372 + }, + { + "epoch": 0.29373, + "grad_norm": 0.5562354470133768, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 29373 + }, + { + "epoch": 0.29374, + "grad_norm": 0.5521219069834646, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 29374 + }, + { + "epoch": 0.29375, + "grad_norm": 0.6518573580635623, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 29375 + }, + { + "epoch": 0.29376, + "grad_norm": 0.8445787062573374, + "learning_rate": 0.003, + "loss": 3.9676, + "step": 29376 + }, + { + "epoch": 0.29377, + "grad_norm": 1.0716315636604676, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 29377 + }, + { + "epoch": 0.29378, + "grad_norm": 0.8855734739817278, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 29378 + }, + { + "epoch": 0.29379, + "grad_norm": 0.7040999874748098, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 29379 + }, + { + "epoch": 0.2938, + "grad_norm": 0.7885734862809363, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 29380 + }, + { + "epoch": 0.29381, + "grad_norm": 0.8772073173017452, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 29381 + }, + { + "epoch": 0.29382, + "grad_norm": 1.1070383687332594, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 29382 + }, + { + "epoch": 0.29383, + "grad_norm": 1.1823486821085294, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 29383 + }, + { + "epoch": 0.29384, + "grad_norm": 0.824068428310071, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 29384 + }, + { + "epoch": 0.29385, + "grad_norm": 0.7665301938662346, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 29385 + }, + { + "epoch": 0.29386, + "grad_norm": 0.7195913168185638, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 29386 + }, + { + "epoch": 0.29387, + "grad_norm": 0.7467475163534808, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 29387 + }, + { + "epoch": 0.29388, + "grad_norm": 0.7378043214796863, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 29388 + }, + { + "epoch": 0.29389, + "grad_norm": 0.7729985333158655, + "learning_rate": 0.003, + "loss": 4.012, + "step": 29389 + }, + { + "epoch": 0.2939, + "grad_norm": 0.8485866748452497, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 29390 + }, + { + "epoch": 0.29391, + "grad_norm": 0.8846622982264443, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 29391 + }, + { + "epoch": 0.29392, + "grad_norm": 1.0386675904125229, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 29392 + }, + { + "epoch": 0.29393, + "grad_norm": 1.143344713470598, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 29393 + }, + { + "epoch": 0.29394, + "grad_norm": 0.8325357363399948, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 29394 + }, + { + "epoch": 0.29395, + "grad_norm": 0.8474904371237587, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 29395 + }, + { + "epoch": 0.29396, + "grad_norm": 0.9194393410525915, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 29396 + }, + { + "epoch": 0.29397, + "grad_norm": 1.1119271052996582, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 29397 + }, + { + "epoch": 0.29398, + "grad_norm": 0.890142378599785, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 29398 + }, + { + "epoch": 0.29399, + "grad_norm": 0.775042444165704, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 29399 + }, + { + "epoch": 0.294, + "grad_norm": 0.6977583946766327, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 29400 + }, + { + "epoch": 0.29401, + "grad_norm": 0.7195391011420256, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 29401 + }, + { + "epoch": 0.29402, + "grad_norm": 0.7237915768477647, + "learning_rate": 0.003, + "loss": 4.057, + "step": 29402 + }, + { + "epoch": 0.29403, + "grad_norm": 0.8111438671199099, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 29403 + }, + { + "epoch": 0.29404, + "grad_norm": 0.9547673336666775, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 29404 + }, + { + "epoch": 0.29405, + "grad_norm": 1.0508424483800807, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 29405 + }, + { + "epoch": 0.29406, + "grad_norm": 1.0430394877156017, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 29406 + }, + { + "epoch": 0.29407, + "grad_norm": 1.0257595835759235, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 29407 + }, + { + "epoch": 0.29408, + "grad_norm": 0.8901802891087471, + "learning_rate": 0.003, + "loss": 4.045, + "step": 29408 + }, + { + "epoch": 0.29409, + "grad_norm": 1.0341414318962376, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 29409 + }, + { + "epoch": 0.2941, + "grad_norm": 0.9075300764837801, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 29410 + }, + { + "epoch": 0.29411, + "grad_norm": 0.8329102074623471, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 29411 + }, + { + "epoch": 0.29412, + "grad_norm": 0.8968233295213194, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 29412 + }, + { + "epoch": 0.29413, + "grad_norm": 0.8382196147532294, + "learning_rate": 0.003, + "loss": 4.037, + "step": 29413 + }, + { + "epoch": 0.29414, + "grad_norm": 0.7916301118266401, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 29414 + }, + { + "epoch": 0.29415, + "grad_norm": 0.907773529494859, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 29415 + }, + { + "epoch": 0.29416, + "grad_norm": 1.091731432805627, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 29416 + }, + { + "epoch": 0.29417, + "grad_norm": 1.2279355011694542, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 29417 + }, + { + "epoch": 0.29418, + "grad_norm": 1.0364428282659894, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 29418 + }, + { + "epoch": 0.29419, + "grad_norm": 0.9712972652451504, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 29419 + }, + { + "epoch": 0.2942, + "grad_norm": 0.9781110015034683, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 29420 + }, + { + "epoch": 0.29421, + "grad_norm": 0.7647066938784287, + "learning_rate": 0.003, + "loss": 4.026, + "step": 29421 + }, + { + "epoch": 0.29422, + "grad_norm": 0.5502451154897172, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 29422 + }, + { + "epoch": 0.29423, + "grad_norm": 0.6075951211892019, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 29423 + }, + { + "epoch": 0.29424, + "grad_norm": 0.6464675368552271, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 29424 + }, + { + "epoch": 0.29425, + "grad_norm": 0.6627867712028837, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 29425 + }, + { + "epoch": 0.29426, + "grad_norm": 0.6846854367081188, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 29426 + }, + { + "epoch": 0.29427, + "grad_norm": 0.7681286337363664, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 29427 + }, + { + "epoch": 0.29428, + "grad_norm": 1.0320443371701227, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 29428 + }, + { + "epoch": 0.29429, + "grad_norm": 1.1669986666989423, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 29429 + }, + { + "epoch": 0.2943, + "grad_norm": 0.7351435156755731, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 29430 + }, + { + "epoch": 0.29431, + "grad_norm": 0.574233678938655, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 29431 + }, + { + "epoch": 0.29432, + "grad_norm": 0.5843227567201444, + "learning_rate": 0.003, + "loss": 4.019, + "step": 29432 + }, + { + "epoch": 0.29433, + "grad_norm": 0.6328376861445577, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 29433 + }, + { + "epoch": 0.29434, + "grad_norm": 0.7474010091636214, + "learning_rate": 0.003, + "loss": 4.044, + "step": 29434 + }, + { + "epoch": 0.29435, + "grad_norm": 0.8528713482052022, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 29435 + }, + { + "epoch": 0.29436, + "grad_norm": 0.8170653541615462, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 29436 + }, + { + "epoch": 0.29437, + "grad_norm": 0.7911538671197178, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 29437 + }, + { + "epoch": 0.29438, + "grad_norm": 0.8699160963102068, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 29438 + }, + { + "epoch": 0.29439, + "grad_norm": 0.9195430257919991, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 29439 + }, + { + "epoch": 0.2944, + "grad_norm": 0.8205631760201486, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 29440 + }, + { + "epoch": 0.29441, + "grad_norm": 0.771694743786183, + "learning_rate": 0.003, + "loss": 4.037, + "step": 29441 + }, + { + "epoch": 0.29442, + "grad_norm": 0.7973155023227254, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 29442 + }, + { + "epoch": 0.29443, + "grad_norm": 0.8943623151740967, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 29443 + }, + { + "epoch": 0.29444, + "grad_norm": 0.9399768421798038, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 29444 + }, + { + "epoch": 0.29445, + "grad_norm": 0.9923662002372997, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 29445 + }, + { + "epoch": 0.29446, + "grad_norm": 1.0405872044250775, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 29446 + }, + { + "epoch": 0.29447, + "grad_norm": 1.0471919222228883, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 29447 + }, + { + "epoch": 0.29448, + "grad_norm": 0.846721521217031, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 29448 + }, + { + "epoch": 0.29449, + "grad_norm": 0.7551628261238632, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 29449 + }, + { + "epoch": 0.2945, + "grad_norm": 0.8234291243578339, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 29450 + }, + { + "epoch": 0.29451, + "grad_norm": 0.8695330127680944, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 29451 + }, + { + "epoch": 0.29452, + "grad_norm": 0.9472059697091871, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 29452 + }, + { + "epoch": 0.29453, + "grad_norm": 1.1260293913305148, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 29453 + }, + { + "epoch": 0.29454, + "grad_norm": 1.040259203859668, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 29454 + }, + { + "epoch": 0.29455, + "grad_norm": 0.9312626050299808, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 29455 + }, + { + "epoch": 0.29456, + "grad_norm": 1.0108751514201586, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 29456 + }, + { + "epoch": 0.29457, + "grad_norm": 1.1150873160426766, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 29457 + }, + { + "epoch": 0.29458, + "grad_norm": 0.9282232771748145, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 29458 + }, + { + "epoch": 0.29459, + "grad_norm": 0.810583068691548, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 29459 + }, + { + "epoch": 0.2946, + "grad_norm": 0.8181249956960069, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 29460 + }, + { + "epoch": 0.29461, + "grad_norm": 0.7990932822464166, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 29461 + }, + { + "epoch": 0.29462, + "grad_norm": 0.9409787560412682, + "learning_rate": 0.003, + "loss": 4.021, + "step": 29462 + }, + { + "epoch": 0.29463, + "grad_norm": 1.0878902263865704, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 29463 + }, + { + "epoch": 0.29464, + "grad_norm": 0.8516730948613528, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 29464 + }, + { + "epoch": 0.29465, + "grad_norm": 0.8710536265885893, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 29465 + }, + { + "epoch": 0.29466, + "grad_norm": 0.8915727174850789, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 29466 + }, + { + "epoch": 0.29467, + "grad_norm": 0.8705364756496882, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 29467 + }, + { + "epoch": 0.29468, + "grad_norm": 0.8304403580999421, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 29468 + }, + { + "epoch": 0.29469, + "grad_norm": 0.7785606132144904, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 29469 + }, + { + "epoch": 0.2947, + "grad_norm": 0.8008128384464949, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 29470 + }, + { + "epoch": 0.29471, + "grad_norm": 0.8130658489116273, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 29471 + }, + { + "epoch": 0.29472, + "grad_norm": 0.7643610907829264, + "learning_rate": 0.003, + "loss": 4.0048, + "step": 29472 + }, + { + "epoch": 0.29473, + "grad_norm": 0.7873625835649938, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 29473 + }, + { + "epoch": 0.29474, + "grad_norm": 0.9768499358461032, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 29474 + }, + { + "epoch": 0.29475, + "grad_norm": 1.0664920194339051, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 29475 + }, + { + "epoch": 0.29476, + "grad_norm": 0.8198234745307579, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 29476 + }, + { + "epoch": 0.29477, + "grad_norm": 0.6999768683716316, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 29477 + }, + { + "epoch": 0.29478, + "grad_norm": 0.6698242942409547, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 29478 + }, + { + "epoch": 0.29479, + "grad_norm": 0.6647049597418161, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 29479 + }, + { + "epoch": 0.2948, + "grad_norm": 0.6677754405325208, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 29480 + }, + { + "epoch": 0.29481, + "grad_norm": 0.6995143862658786, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 29481 + }, + { + "epoch": 0.29482, + "grad_norm": 0.794002711475159, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 29482 + }, + { + "epoch": 0.29483, + "grad_norm": 0.9434788602165429, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 29483 + }, + { + "epoch": 0.29484, + "grad_norm": 1.192686185090285, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 29484 + }, + { + "epoch": 0.29485, + "grad_norm": 0.8815242194906802, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 29485 + }, + { + "epoch": 0.29486, + "grad_norm": 0.7657186120971496, + "learning_rate": 0.003, + "loss": 4.021, + "step": 29486 + }, + { + "epoch": 0.29487, + "grad_norm": 0.764947774133333, + "learning_rate": 0.003, + "loss": 4.031, + "step": 29487 + }, + { + "epoch": 0.29488, + "grad_norm": 0.7123030742452965, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 29488 + }, + { + "epoch": 0.29489, + "grad_norm": 0.6559101069447222, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 29489 + }, + { + "epoch": 0.2949, + "grad_norm": 0.5796461460955309, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 29490 + }, + { + "epoch": 0.29491, + "grad_norm": 0.6969441767256102, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 29491 + }, + { + "epoch": 0.29492, + "grad_norm": 0.6890163133223163, + "learning_rate": 0.003, + "loss": 4.008, + "step": 29492 + }, + { + "epoch": 0.29493, + "grad_norm": 0.807105101506135, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 29493 + }, + { + "epoch": 0.29494, + "grad_norm": 0.8991104524951704, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 29494 + }, + { + "epoch": 0.29495, + "grad_norm": 1.0338243230930393, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 29495 + }, + { + "epoch": 0.29496, + "grad_norm": 0.9642614448712948, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 29496 + }, + { + "epoch": 0.29497, + "grad_norm": 1.016237341079977, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 29497 + }, + { + "epoch": 0.29498, + "grad_norm": 0.9862010034761993, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 29498 + }, + { + "epoch": 0.29499, + "grad_norm": 0.9731648416861356, + "learning_rate": 0.003, + "loss": 4.021, + "step": 29499 + }, + { + "epoch": 0.295, + "grad_norm": 1.136399519482867, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 29500 + }, + { + "epoch": 0.29501, + "grad_norm": 0.942864178110427, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 29501 + }, + { + "epoch": 0.29502, + "grad_norm": 0.7623838200910764, + "learning_rate": 0.003, + "loss": 4.035, + "step": 29502 + }, + { + "epoch": 0.29503, + "grad_norm": 0.7160664731109688, + "learning_rate": 0.003, + "loss": 4.072, + "step": 29503 + }, + { + "epoch": 0.29504, + "grad_norm": 0.7421126285864131, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 29504 + }, + { + "epoch": 0.29505, + "grad_norm": 0.7319073303619947, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 29505 + }, + { + "epoch": 0.29506, + "grad_norm": 0.7526278658149351, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 29506 + }, + { + "epoch": 0.29507, + "grad_norm": 0.8191453196250793, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 29507 + }, + { + "epoch": 0.29508, + "grad_norm": 1.0857025274887944, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 29508 + }, + { + "epoch": 0.29509, + "grad_norm": 1.2150819596816411, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 29509 + }, + { + "epoch": 0.2951, + "grad_norm": 0.8190918531372475, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 29510 + }, + { + "epoch": 0.29511, + "grad_norm": 0.77128110683923, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 29511 + }, + { + "epoch": 0.29512, + "grad_norm": 0.7612889148525646, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 29512 + }, + { + "epoch": 0.29513, + "grad_norm": 0.734835710813353, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 29513 + }, + { + "epoch": 0.29514, + "grad_norm": 0.7764659037603447, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 29514 + }, + { + "epoch": 0.29515, + "grad_norm": 0.9024195163548533, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 29515 + }, + { + "epoch": 0.29516, + "grad_norm": 0.987506889096575, + "learning_rate": 0.003, + "loss": 4.045, + "step": 29516 + }, + { + "epoch": 0.29517, + "grad_norm": 1.0641078262301504, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 29517 + }, + { + "epoch": 0.29518, + "grad_norm": 0.8511848700763053, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 29518 + }, + { + "epoch": 0.29519, + "grad_norm": 0.8649927794523451, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 29519 + }, + { + "epoch": 0.2952, + "grad_norm": 0.8756882864815512, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 29520 + }, + { + "epoch": 0.29521, + "grad_norm": 0.879019446662677, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 29521 + }, + { + "epoch": 0.29522, + "grad_norm": 0.9051351209559111, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 29522 + }, + { + "epoch": 0.29523, + "grad_norm": 0.9299109368632689, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 29523 + }, + { + "epoch": 0.29524, + "grad_norm": 0.9656001235730147, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 29524 + }, + { + "epoch": 0.29525, + "grad_norm": 1.0264509223988085, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 29525 + }, + { + "epoch": 0.29526, + "grad_norm": 1.1808398670528966, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 29526 + }, + { + "epoch": 0.29527, + "grad_norm": 0.8023245450306955, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 29527 + }, + { + "epoch": 0.29528, + "grad_norm": 0.7123463170048762, + "learning_rate": 0.003, + "loss": 4.06, + "step": 29528 + }, + { + "epoch": 0.29529, + "grad_norm": 0.7463955513675214, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 29529 + }, + { + "epoch": 0.2953, + "grad_norm": 0.9121723700323058, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 29530 + }, + { + "epoch": 0.29531, + "grad_norm": 1.1925961869679653, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 29531 + }, + { + "epoch": 0.29532, + "grad_norm": 0.762055476458954, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 29532 + }, + { + "epoch": 0.29533, + "grad_norm": 0.661944938134374, + "learning_rate": 0.003, + "loss": 4.0087, + "step": 29533 + }, + { + "epoch": 0.29534, + "grad_norm": 0.575886306095579, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 29534 + }, + { + "epoch": 0.29535, + "grad_norm": 0.5472425238953568, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 29535 + }, + { + "epoch": 0.29536, + "grad_norm": 0.571062041663618, + "learning_rate": 0.003, + "loss": 3.9911, + "step": 29536 + }, + { + "epoch": 0.29537, + "grad_norm": 0.5921001628375261, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 29537 + }, + { + "epoch": 0.29538, + "grad_norm": 0.6551670610860285, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 29538 + }, + { + "epoch": 0.29539, + "grad_norm": 0.8374393810891719, + "learning_rate": 0.003, + "loss": 4.031, + "step": 29539 + }, + { + "epoch": 0.2954, + "grad_norm": 1.0128539041783893, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 29540 + }, + { + "epoch": 0.29541, + "grad_norm": 1.1120568398715585, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 29541 + }, + { + "epoch": 0.29542, + "grad_norm": 0.7087566315141395, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 29542 + }, + { + "epoch": 0.29543, + "grad_norm": 0.6455174430036086, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 29543 + }, + { + "epoch": 0.29544, + "grad_norm": 0.730123729795281, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 29544 + }, + { + "epoch": 0.29545, + "grad_norm": 0.7695499029463173, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 29545 + }, + { + "epoch": 0.29546, + "grad_norm": 0.798957318269689, + "learning_rate": 0.003, + "loss": 4.034, + "step": 29546 + }, + { + "epoch": 0.29547, + "grad_norm": 0.8734950830632281, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 29547 + }, + { + "epoch": 0.29548, + "grad_norm": 0.9887024410143668, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 29548 + }, + { + "epoch": 0.29549, + "grad_norm": 1.1180277087182828, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 29549 + }, + { + "epoch": 0.2955, + "grad_norm": 0.7743161580059043, + "learning_rate": 0.003, + "loss": 4.072, + "step": 29550 + }, + { + "epoch": 0.29551, + "grad_norm": 0.645265233569196, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 29551 + }, + { + "epoch": 0.29552, + "grad_norm": 0.7195815093661289, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 29552 + }, + { + "epoch": 0.29553, + "grad_norm": 0.9345691445009946, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 29553 + }, + { + "epoch": 0.29554, + "grad_norm": 0.9851289588394291, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 29554 + }, + { + "epoch": 0.29555, + "grad_norm": 0.8662162788211244, + "learning_rate": 0.003, + "loss": 4.0034, + "step": 29555 + }, + { + "epoch": 0.29556, + "grad_norm": 1.0285263698127753, + "learning_rate": 0.003, + "loss": 4.052, + "step": 29556 + }, + { + "epoch": 0.29557, + "grad_norm": 0.8204976746681296, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 29557 + }, + { + "epoch": 0.29558, + "grad_norm": 0.7493046369735843, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 29558 + }, + { + "epoch": 0.29559, + "grad_norm": 0.7747280826427463, + "learning_rate": 0.003, + "loss": 4.0033, + "step": 29559 + }, + { + "epoch": 0.2956, + "grad_norm": 0.8393597894478902, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 29560 + }, + { + "epoch": 0.29561, + "grad_norm": 0.9308004435293608, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 29561 + }, + { + "epoch": 0.29562, + "grad_norm": 0.9523914271200857, + "learning_rate": 0.003, + "loss": 4.054, + "step": 29562 + }, + { + "epoch": 0.29563, + "grad_norm": 0.9597397436633324, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 29563 + }, + { + "epoch": 0.29564, + "grad_norm": 1.0260306861923776, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 29564 + }, + { + "epoch": 0.29565, + "grad_norm": 0.9137918689610293, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 29565 + }, + { + "epoch": 0.29566, + "grad_norm": 0.8630218318084596, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 29566 + }, + { + "epoch": 0.29567, + "grad_norm": 0.9202201205435803, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 29567 + }, + { + "epoch": 0.29568, + "grad_norm": 0.9371638569610556, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 29568 + }, + { + "epoch": 0.29569, + "grad_norm": 1.0390643349735722, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 29569 + }, + { + "epoch": 0.2957, + "grad_norm": 1.1794776301400998, + "learning_rate": 0.003, + "loss": 4.0021, + "step": 29570 + }, + { + "epoch": 0.29571, + "grad_norm": 1.021249842684877, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 29571 + }, + { + "epoch": 0.29572, + "grad_norm": 1.0871462567103367, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 29572 + }, + { + "epoch": 0.29573, + "grad_norm": 1.0569978065315713, + "learning_rate": 0.003, + "loss": 4.048, + "step": 29573 + }, + { + "epoch": 0.29574, + "grad_norm": 0.926605531433748, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 29574 + }, + { + "epoch": 0.29575, + "grad_norm": 0.8473364328806434, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 29575 + }, + { + "epoch": 0.29576, + "grad_norm": 0.7467891036255305, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 29576 + }, + { + "epoch": 0.29577, + "grad_norm": 0.6888699641560188, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 29577 + }, + { + "epoch": 0.29578, + "grad_norm": 0.6611322621244413, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 29578 + }, + { + "epoch": 0.29579, + "grad_norm": 0.8588671885118704, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 29579 + }, + { + "epoch": 0.2958, + "grad_norm": 0.9365676143287244, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 29580 + }, + { + "epoch": 0.29581, + "grad_norm": 0.9759202804955065, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 29581 + }, + { + "epoch": 0.29582, + "grad_norm": 1.3194867649724762, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 29582 + }, + { + "epoch": 0.29583, + "grad_norm": 0.7109295383577998, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 29583 + }, + { + "epoch": 0.29584, + "grad_norm": 0.5887460743689457, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 29584 + }, + { + "epoch": 0.29585, + "grad_norm": 0.5219985540242765, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 29585 + }, + { + "epoch": 0.29586, + "grad_norm": 0.5503463955774311, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 29586 + }, + { + "epoch": 0.29587, + "grad_norm": 0.512112772950889, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 29587 + }, + { + "epoch": 0.29588, + "grad_norm": 0.5399087861647854, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 29588 + }, + { + "epoch": 0.29589, + "grad_norm": 0.527214607148675, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 29589 + }, + { + "epoch": 0.2959, + "grad_norm": 0.5526671339848339, + "learning_rate": 0.003, + "loss": 3.998, + "step": 29590 + }, + { + "epoch": 0.29591, + "grad_norm": 0.566958697794794, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 29591 + }, + { + "epoch": 0.29592, + "grad_norm": 0.5832809965410394, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 29592 + }, + { + "epoch": 0.29593, + "grad_norm": 0.5750473992025271, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 29593 + }, + { + "epoch": 0.29594, + "grad_norm": 0.6220475341371999, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 29594 + }, + { + "epoch": 0.29595, + "grad_norm": 0.6728802995895984, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 29595 + }, + { + "epoch": 0.29596, + "grad_norm": 0.8310651183092241, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 29596 + }, + { + "epoch": 0.29597, + "grad_norm": 1.1012629577376085, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 29597 + }, + { + "epoch": 0.29598, + "grad_norm": 1.0535211463857392, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 29598 + }, + { + "epoch": 0.29599, + "grad_norm": 1.0212072643068095, + "learning_rate": 0.003, + "loss": 4.025, + "step": 29599 + }, + { + "epoch": 0.296, + "grad_norm": 1.1297903558209657, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 29600 + }, + { + "epoch": 0.29601, + "grad_norm": 0.8741276178791385, + "learning_rate": 0.003, + "loss": 4.072, + "step": 29601 + }, + { + "epoch": 0.29602, + "grad_norm": 0.9073951972046345, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 29602 + }, + { + "epoch": 0.29603, + "grad_norm": 0.8217460421860436, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 29603 + }, + { + "epoch": 0.29604, + "grad_norm": 0.7398378463036518, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 29604 + }, + { + "epoch": 0.29605, + "grad_norm": 0.7949452650483415, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 29605 + }, + { + "epoch": 0.29606, + "grad_norm": 0.7633065173124948, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 29606 + }, + { + "epoch": 0.29607, + "grad_norm": 0.7829140020877731, + "learning_rate": 0.003, + "loss": 4.023, + "step": 29607 + }, + { + "epoch": 0.29608, + "grad_norm": 0.7938670416348678, + "learning_rate": 0.003, + "loss": 4.0053, + "step": 29608 + }, + { + "epoch": 0.29609, + "grad_norm": 0.8010392803898696, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 29609 + }, + { + "epoch": 0.2961, + "grad_norm": 0.9542051616240214, + "learning_rate": 0.003, + "loss": 4.04, + "step": 29610 + }, + { + "epoch": 0.29611, + "grad_norm": 1.1626236340796248, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 29611 + }, + { + "epoch": 0.29612, + "grad_norm": 0.8621593014942216, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 29612 + }, + { + "epoch": 0.29613, + "grad_norm": 0.7819868846487147, + "learning_rate": 0.003, + "loss": 4.0071, + "step": 29613 + }, + { + "epoch": 0.29614, + "grad_norm": 0.7932869759280725, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 29614 + }, + { + "epoch": 0.29615, + "grad_norm": 0.739249644562522, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 29615 + }, + { + "epoch": 0.29616, + "grad_norm": 0.7463235633943912, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 29616 + }, + { + "epoch": 0.29617, + "grad_norm": 0.6900108220946533, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 29617 + }, + { + "epoch": 0.29618, + "grad_norm": 0.7575690106096097, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 29618 + }, + { + "epoch": 0.29619, + "grad_norm": 0.8397644291261247, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 29619 + }, + { + "epoch": 0.2962, + "grad_norm": 0.9186349882590031, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 29620 + }, + { + "epoch": 0.29621, + "grad_norm": 1.0755308522755005, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 29621 + }, + { + "epoch": 0.29622, + "grad_norm": 1.1406148923172066, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 29622 + }, + { + "epoch": 0.29623, + "grad_norm": 0.9056012931374529, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 29623 + }, + { + "epoch": 0.29624, + "grad_norm": 0.8978287031207305, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 29624 + }, + { + "epoch": 0.29625, + "grad_norm": 0.8942775652892172, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 29625 + }, + { + "epoch": 0.29626, + "grad_norm": 0.8478470372000767, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 29626 + }, + { + "epoch": 0.29627, + "grad_norm": 0.8882974986771165, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 29627 + }, + { + "epoch": 0.29628, + "grad_norm": 1.0318189510158946, + "learning_rate": 0.003, + "loss": 4.043, + "step": 29628 + }, + { + "epoch": 0.29629, + "grad_norm": 0.9701998439781265, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 29629 + }, + { + "epoch": 0.2963, + "grad_norm": 1.0086135975062058, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 29630 + }, + { + "epoch": 0.29631, + "grad_norm": 1.0456520112074126, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 29631 + }, + { + "epoch": 0.29632, + "grad_norm": 1.122121201150599, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 29632 + }, + { + "epoch": 0.29633, + "grad_norm": 0.8964923787783627, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 29633 + }, + { + "epoch": 0.29634, + "grad_norm": 0.8502426772406847, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 29634 + }, + { + "epoch": 0.29635, + "grad_norm": 0.925780586940106, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 29635 + }, + { + "epoch": 0.29636, + "grad_norm": 1.0108082754823764, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 29636 + }, + { + "epoch": 0.29637, + "grad_norm": 1.152125571785256, + "learning_rate": 0.003, + "loss": 4.063, + "step": 29637 + }, + { + "epoch": 0.29638, + "grad_norm": 1.039164345116775, + "learning_rate": 0.003, + "loss": 4.053, + "step": 29638 + }, + { + "epoch": 0.29639, + "grad_norm": 0.955630958968111, + "learning_rate": 0.003, + "loss": 4.072, + "step": 29639 + }, + { + "epoch": 0.2964, + "grad_norm": 0.9457602089450506, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 29640 + }, + { + "epoch": 0.29641, + "grad_norm": 1.0631782794572842, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 29641 + }, + { + "epoch": 0.29642, + "grad_norm": 0.9835046624981021, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 29642 + }, + { + "epoch": 0.29643, + "grad_norm": 0.8803066089019086, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 29643 + }, + { + "epoch": 0.29644, + "grad_norm": 0.8602293082260116, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 29644 + }, + { + "epoch": 0.29645, + "grad_norm": 0.8418993402895806, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 29645 + }, + { + "epoch": 0.29646, + "grad_norm": 0.7526301468369274, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 29646 + }, + { + "epoch": 0.29647, + "grad_norm": 0.829762735849243, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 29647 + }, + { + "epoch": 0.29648, + "grad_norm": 0.9371174195527552, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 29648 + }, + { + "epoch": 0.29649, + "grad_norm": 1.009484719954905, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 29649 + }, + { + "epoch": 0.2965, + "grad_norm": 0.9406803127506687, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 29650 + }, + { + "epoch": 0.29651, + "grad_norm": 0.8824418327237745, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 29651 + }, + { + "epoch": 0.29652, + "grad_norm": 0.7726539682395782, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 29652 + }, + { + "epoch": 0.29653, + "grad_norm": 0.6614874259634501, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 29653 + }, + { + "epoch": 0.29654, + "grad_norm": 0.5928611140891672, + "learning_rate": 0.003, + "loss": 3.9935, + "step": 29654 + }, + { + "epoch": 0.29655, + "grad_norm": 0.6241851822305493, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 29655 + }, + { + "epoch": 0.29656, + "grad_norm": 0.5681418179543076, + "learning_rate": 0.003, + "loss": 4.0088, + "step": 29656 + }, + { + "epoch": 0.29657, + "grad_norm": 0.5364055499223117, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 29657 + }, + { + "epoch": 0.29658, + "grad_norm": 0.6370679706561624, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 29658 + }, + { + "epoch": 0.29659, + "grad_norm": 0.7268044673307017, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 29659 + }, + { + "epoch": 0.2966, + "grad_norm": 0.877045654676319, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 29660 + }, + { + "epoch": 0.29661, + "grad_norm": 1.1645647816576228, + "learning_rate": 0.003, + "loss": 4.04, + "step": 29661 + }, + { + "epoch": 0.29662, + "grad_norm": 1.1408081566642048, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 29662 + }, + { + "epoch": 0.29663, + "grad_norm": 0.8514601043194153, + "learning_rate": 0.003, + "loss": 4.045, + "step": 29663 + }, + { + "epoch": 0.29664, + "grad_norm": 0.7343051062249578, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 29664 + }, + { + "epoch": 0.29665, + "grad_norm": 0.6913645925867937, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 29665 + }, + { + "epoch": 0.29666, + "grad_norm": 0.7321238508139923, + "learning_rate": 0.003, + "loss": 4.047, + "step": 29666 + }, + { + "epoch": 0.29667, + "grad_norm": 0.6982298018406868, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 29667 + }, + { + "epoch": 0.29668, + "grad_norm": 0.7136694579138156, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 29668 + }, + { + "epoch": 0.29669, + "grad_norm": 0.6637359125409228, + "learning_rate": 0.003, + "loss": 4.045, + "step": 29669 + }, + { + "epoch": 0.2967, + "grad_norm": 0.6883563983666308, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 29670 + }, + { + "epoch": 0.29671, + "grad_norm": 0.7136494169718673, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 29671 + }, + { + "epoch": 0.29672, + "grad_norm": 0.7947545752396443, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 29672 + }, + { + "epoch": 0.29673, + "grad_norm": 0.7154622820917592, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 29673 + }, + { + "epoch": 0.29674, + "grad_norm": 0.7607427849331586, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 29674 + }, + { + "epoch": 0.29675, + "grad_norm": 0.8763483258759387, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 29675 + }, + { + "epoch": 0.29676, + "grad_norm": 1.2380102699713451, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 29676 + }, + { + "epoch": 0.29677, + "grad_norm": 0.7851799758030761, + "learning_rate": 0.003, + "loss": 4.023, + "step": 29677 + }, + { + "epoch": 0.29678, + "grad_norm": 0.6055775967261222, + "learning_rate": 0.003, + "loss": 3.9948, + "step": 29678 + }, + { + "epoch": 0.29679, + "grad_norm": 0.6331845486836994, + "learning_rate": 0.003, + "loss": 4.0031, + "step": 29679 + }, + { + "epoch": 0.2968, + "grad_norm": 0.7430470795238474, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 29680 + }, + { + "epoch": 0.29681, + "grad_norm": 0.7624076017508807, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 29681 + }, + { + "epoch": 0.29682, + "grad_norm": 0.61983155564648, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 29682 + }, + { + "epoch": 0.29683, + "grad_norm": 0.6328813671032738, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 29683 + }, + { + "epoch": 0.29684, + "grad_norm": 0.6576067957676136, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 29684 + }, + { + "epoch": 0.29685, + "grad_norm": 0.8308586745323387, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 29685 + }, + { + "epoch": 0.29686, + "grad_norm": 1.0608540377470554, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 29686 + }, + { + "epoch": 0.29687, + "grad_norm": 1.1731593338542414, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 29687 + }, + { + "epoch": 0.29688, + "grad_norm": 0.8705715812090735, + "learning_rate": 0.003, + "loss": 4.03, + "step": 29688 + }, + { + "epoch": 0.29689, + "grad_norm": 0.8875891068838223, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 29689 + }, + { + "epoch": 0.2969, + "grad_norm": 0.9099328437632714, + "learning_rate": 0.003, + "loss": 4.033, + "step": 29690 + }, + { + "epoch": 0.29691, + "grad_norm": 0.9163390680700281, + "learning_rate": 0.003, + "loss": 4.039, + "step": 29691 + }, + { + "epoch": 0.29692, + "grad_norm": 1.1507738729704846, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 29692 + }, + { + "epoch": 0.29693, + "grad_norm": 1.0673164314393269, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 29693 + }, + { + "epoch": 0.29694, + "grad_norm": 0.971512945628094, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 29694 + }, + { + "epoch": 0.29695, + "grad_norm": 1.109622578772811, + "learning_rate": 0.003, + "loss": 4.054, + "step": 29695 + }, + { + "epoch": 0.29696, + "grad_norm": 0.9342161206080827, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 29696 + }, + { + "epoch": 0.29697, + "grad_norm": 0.9385147288680579, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 29697 + }, + { + "epoch": 0.29698, + "grad_norm": 0.8946363528941353, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 29698 + }, + { + "epoch": 0.29699, + "grad_norm": 0.820235416769547, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 29699 + }, + { + "epoch": 0.297, + "grad_norm": 0.79023674161317, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 29700 + }, + { + "epoch": 0.29701, + "grad_norm": 0.8408153240941739, + "learning_rate": 0.003, + "loss": 4.057, + "step": 29701 + }, + { + "epoch": 0.29702, + "grad_norm": 0.9916062068131645, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 29702 + }, + { + "epoch": 0.29703, + "grad_norm": 1.2658707477408067, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 29703 + }, + { + "epoch": 0.29704, + "grad_norm": 0.8569388924458262, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 29704 + }, + { + "epoch": 0.29705, + "grad_norm": 0.8709509263416317, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 29705 + }, + { + "epoch": 0.29706, + "grad_norm": 0.9274583416516791, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 29706 + }, + { + "epoch": 0.29707, + "grad_norm": 0.9953752983054379, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 29707 + }, + { + "epoch": 0.29708, + "grad_norm": 1.0154630634355237, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 29708 + }, + { + "epoch": 0.29709, + "grad_norm": 1.0324954467148522, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 29709 + }, + { + "epoch": 0.2971, + "grad_norm": 0.9814300142787983, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 29710 + }, + { + "epoch": 0.29711, + "grad_norm": 0.9308114907413595, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 29711 + }, + { + "epoch": 0.29712, + "grad_norm": 0.8404837084850487, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 29712 + }, + { + "epoch": 0.29713, + "grad_norm": 0.8147618442565345, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 29713 + }, + { + "epoch": 0.29714, + "grad_norm": 0.7916539372622257, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 29714 + }, + { + "epoch": 0.29715, + "grad_norm": 0.8010260484088131, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 29715 + }, + { + "epoch": 0.29716, + "grad_norm": 0.7744813003608569, + "learning_rate": 0.003, + "loss": 4.069, + "step": 29716 + }, + { + "epoch": 0.29717, + "grad_norm": 0.6615541597781733, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 29717 + }, + { + "epoch": 0.29718, + "grad_norm": 0.5891056875590933, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 29718 + }, + { + "epoch": 0.29719, + "grad_norm": 0.6109715601698277, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 29719 + }, + { + "epoch": 0.2972, + "grad_norm": 0.6719435882520997, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 29720 + }, + { + "epoch": 0.29721, + "grad_norm": 0.7399503064204112, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 29721 + }, + { + "epoch": 0.29722, + "grad_norm": 0.8881285731177666, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 29722 + }, + { + "epoch": 0.29723, + "grad_norm": 1.2063009855350686, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 29723 + }, + { + "epoch": 0.29724, + "grad_norm": 1.0564501177997567, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 29724 + }, + { + "epoch": 0.29725, + "grad_norm": 0.7714319217486869, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 29725 + }, + { + "epoch": 0.29726, + "grad_norm": 0.6979303292405438, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 29726 + }, + { + "epoch": 0.29727, + "grad_norm": 0.776882368729348, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 29727 + }, + { + "epoch": 0.29728, + "grad_norm": 0.8735100440246475, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 29728 + }, + { + "epoch": 0.29729, + "grad_norm": 0.884608600652728, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 29729 + }, + { + "epoch": 0.2973, + "grad_norm": 0.7983200596192327, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 29730 + }, + { + "epoch": 0.29731, + "grad_norm": 0.7306407924064392, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 29731 + }, + { + "epoch": 0.29732, + "grad_norm": 0.7587832174618953, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 29732 + }, + { + "epoch": 0.29733, + "grad_norm": 0.8929159716862467, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 29733 + }, + { + "epoch": 0.29734, + "grad_norm": 1.0030335108818575, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 29734 + }, + { + "epoch": 0.29735, + "grad_norm": 1.1285104505245875, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 29735 + }, + { + "epoch": 0.29736, + "grad_norm": 0.9399870544681409, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 29736 + }, + { + "epoch": 0.29737, + "grad_norm": 0.8652065996898423, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 29737 + }, + { + "epoch": 0.29738, + "grad_norm": 0.8237999093397274, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 29738 + }, + { + "epoch": 0.29739, + "grad_norm": 0.7739115685825558, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 29739 + }, + { + "epoch": 0.2974, + "grad_norm": 0.7212454050258579, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 29740 + }, + { + "epoch": 0.29741, + "grad_norm": 0.7450060867478219, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 29741 + }, + { + "epoch": 0.29742, + "grad_norm": 0.7510429669549749, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 29742 + }, + { + "epoch": 0.29743, + "grad_norm": 0.7324361765145662, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 29743 + }, + { + "epoch": 0.29744, + "grad_norm": 0.7823156763480396, + "learning_rate": 0.003, + "loss": 4.058, + "step": 29744 + }, + { + "epoch": 0.29745, + "grad_norm": 1.0275229981848362, + "learning_rate": 0.003, + "loss": 4.039, + "step": 29745 + }, + { + "epoch": 0.29746, + "grad_norm": 1.1153082681017585, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 29746 + }, + { + "epoch": 0.29747, + "grad_norm": 1.08832744829383, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 29747 + }, + { + "epoch": 0.29748, + "grad_norm": 1.0440713005434106, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 29748 + }, + { + "epoch": 0.29749, + "grad_norm": 0.9321793036595923, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 29749 + }, + { + "epoch": 0.2975, + "grad_norm": 0.8829255908689442, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 29750 + }, + { + "epoch": 0.29751, + "grad_norm": 0.8967871059237603, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 29751 + }, + { + "epoch": 0.29752, + "grad_norm": 0.753552348299697, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 29752 + }, + { + "epoch": 0.29753, + "grad_norm": 0.6062601552451126, + "learning_rate": 0.003, + "loss": 4.036, + "step": 29753 + }, + { + "epoch": 0.29754, + "grad_norm": 0.6339606128262646, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 29754 + }, + { + "epoch": 0.29755, + "grad_norm": 0.7230749238072391, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 29755 + }, + { + "epoch": 0.29756, + "grad_norm": 0.7657519147767696, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 29756 + }, + { + "epoch": 0.29757, + "grad_norm": 0.7666752713901773, + "learning_rate": 0.003, + "loss": 4.014, + "step": 29757 + }, + { + "epoch": 0.29758, + "grad_norm": 0.8096820214067828, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 29758 + }, + { + "epoch": 0.29759, + "grad_norm": 0.8480195564404693, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 29759 + }, + { + "epoch": 0.2976, + "grad_norm": 0.8755426884377995, + "learning_rate": 0.003, + "loss": 4.068, + "step": 29760 + }, + { + "epoch": 0.29761, + "grad_norm": 0.9025945835045597, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 29761 + }, + { + "epoch": 0.29762, + "grad_norm": 0.893742559643703, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 29762 + }, + { + "epoch": 0.29763, + "grad_norm": 0.7638140155680376, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 29763 + }, + { + "epoch": 0.29764, + "grad_norm": 0.6696205205087604, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 29764 + }, + { + "epoch": 0.29765, + "grad_norm": 0.6402113653291301, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 29765 + }, + { + "epoch": 0.29766, + "grad_norm": 0.6889426009158199, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 29766 + }, + { + "epoch": 0.29767, + "grad_norm": 0.7717369554544314, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 29767 + }, + { + "epoch": 0.29768, + "grad_norm": 0.936639939699774, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 29768 + }, + { + "epoch": 0.29769, + "grad_norm": 1.221681340990684, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 29769 + }, + { + "epoch": 0.2977, + "grad_norm": 1.0265064931081562, + "learning_rate": 0.003, + "loss": 4.044, + "step": 29770 + }, + { + "epoch": 0.29771, + "grad_norm": 1.0054701270809359, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 29771 + }, + { + "epoch": 0.29772, + "grad_norm": 1.002019850211728, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 29772 + }, + { + "epoch": 0.29773, + "grad_norm": 1.0561698708408154, + "learning_rate": 0.003, + "loss": 4.055, + "step": 29773 + }, + { + "epoch": 0.29774, + "grad_norm": 1.1316651053086748, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 29774 + }, + { + "epoch": 0.29775, + "grad_norm": 0.8368753114257649, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 29775 + }, + { + "epoch": 0.29776, + "grad_norm": 0.8055798512404035, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 29776 + }, + { + "epoch": 0.29777, + "grad_norm": 0.6899477198209971, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 29777 + }, + { + "epoch": 0.29778, + "grad_norm": 0.6523284086009201, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 29778 + }, + { + "epoch": 0.29779, + "grad_norm": 0.7239401618591073, + "learning_rate": 0.003, + "loss": 4.05, + "step": 29779 + }, + { + "epoch": 0.2978, + "grad_norm": 0.9664884900967454, + "learning_rate": 0.003, + "loss": 4.026, + "step": 29780 + }, + { + "epoch": 0.29781, + "grad_norm": 1.234128046077109, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 29781 + }, + { + "epoch": 0.29782, + "grad_norm": 0.6854677600327224, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 29782 + }, + { + "epoch": 0.29783, + "grad_norm": 0.6821932066191768, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 29783 + }, + { + "epoch": 0.29784, + "grad_norm": 0.7334846343839895, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 29784 + }, + { + "epoch": 0.29785, + "grad_norm": 0.8613948084461409, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 29785 + }, + { + "epoch": 0.29786, + "grad_norm": 0.985413525952653, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 29786 + }, + { + "epoch": 0.29787, + "grad_norm": 0.9268528271207865, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 29787 + }, + { + "epoch": 0.29788, + "grad_norm": 0.8233064234641196, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 29788 + }, + { + "epoch": 0.29789, + "grad_norm": 0.6614358515098475, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 29789 + }, + { + "epoch": 0.2979, + "grad_norm": 0.5946823989392257, + "learning_rate": 0.003, + "loss": 3.9966, + "step": 29790 + }, + { + "epoch": 0.29791, + "grad_norm": 0.6311183531736815, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 29791 + }, + { + "epoch": 0.29792, + "grad_norm": 0.59783935842209, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 29792 + }, + { + "epoch": 0.29793, + "grad_norm": 0.6020949066879727, + "learning_rate": 0.003, + "loss": 4.031, + "step": 29793 + }, + { + "epoch": 0.29794, + "grad_norm": 0.7097508471024112, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 29794 + }, + { + "epoch": 0.29795, + "grad_norm": 0.7379849122948166, + "learning_rate": 0.003, + "loss": 4.075, + "step": 29795 + }, + { + "epoch": 0.29796, + "grad_norm": 0.670652202591052, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 29796 + }, + { + "epoch": 0.29797, + "grad_norm": 0.7749702787846947, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 29797 + }, + { + "epoch": 0.29798, + "grad_norm": 1.063582357180149, + "learning_rate": 0.003, + "loss": 4.016, + "step": 29798 + }, + { + "epoch": 0.29799, + "grad_norm": 1.292542836392054, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 29799 + }, + { + "epoch": 0.298, + "grad_norm": 0.9577123938686618, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 29800 + }, + { + "epoch": 0.29801, + "grad_norm": 0.8765144754295019, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 29801 + }, + { + "epoch": 0.29802, + "grad_norm": 0.8599844172808233, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 29802 + }, + { + "epoch": 0.29803, + "grad_norm": 0.8152042706777664, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 29803 + }, + { + "epoch": 0.29804, + "grad_norm": 0.8856433268858597, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 29804 + }, + { + "epoch": 0.29805, + "grad_norm": 0.8435965168644539, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 29805 + }, + { + "epoch": 0.29806, + "grad_norm": 0.8959856743660108, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 29806 + }, + { + "epoch": 0.29807, + "grad_norm": 1.0808917490144245, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 29807 + }, + { + "epoch": 0.29808, + "grad_norm": 1.1442659220970852, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 29808 + }, + { + "epoch": 0.29809, + "grad_norm": 0.8071600642590813, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 29809 + }, + { + "epoch": 0.2981, + "grad_norm": 0.7152612517749717, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 29810 + }, + { + "epoch": 0.29811, + "grad_norm": 0.6785983090888845, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 29811 + }, + { + "epoch": 0.29812, + "grad_norm": 0.7761444871738625, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 29812 + }, + { + "epoch": 0.29813, + "grad_norm": 0.8941966045957692, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 29813 + }, + { + "epoch": 0.29814, + "grad_norm": 0.9566636395768782, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 29814 + }, + { + "epoch": 0.29815, + "grad_norm": 0.8949460660896343, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 29815 + }, + { + "epoch": 0.29816, + "grad_norm": 0.8807018634127046, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 29816 + }, + { + "epoch": 0.29817, + "grad_norm": 0.9091484631748709, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 29817 + }, + { + "epoch": 0.29818, + "grad_norm": 0.8323951210594167, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 29818 + }, + { + "epoch": 0.29819, + "grad_norm": 0.78892133536652, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 29819 + }, + { + "epoch": 0.2982, + "grad_norm": 0.9387053370219199, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 29820 + }, + { + "epoch": 0.29821, + "grad_norm": 1.0921604932230289, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 29821 + }, + { + "epoch": 0.29822, + "grad_norm": 1.1616808463752424, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 29822 + }, + { + "epoch": 0.29823, + "grad_norm": 1.094287442163826, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 29823 + }, + { + "epoch": 0.29824, + "grad_norm": 0.920875056610523, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 29824 + }, + { + "epoch": 0.29825, + "grad_norm": 0.8919604620100988, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 29825 + }, + { + "epoch": 0.29826, + "grad_norm": 0.8167006147210444, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 29826 + }, + { + "epoch": 0.29827, + "grad_norm": 0.7773772443523552, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 29827 + }, + { + "epoch": 0.29828, + "grad_norm": 0.7473722054195873, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 29828 + }, + { + "epoch": 0.29829, + "grad_norm": 0.7392400462911206, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 29829 + }, + { + "epoch": 0.2983, + "grad_norm": 0.7231372275007345, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 29830 + }, + { + "epoch": 0.29831, + "grad_norm": 0.7199135791670837, + "learning_rate": 0.003, + "loss": 4.04, + "step": 29831 + }, + { + "epoch": 0.29832, + "grad_norm": 0.6646203814122272, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 29832 + }, + { + "epoch": 0.29833, + "grad_norm": 0.6631942943293005, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 29833 + }, + { + "epoch": 0.29834, + "grad_norm": 0.6975576911842695, + "learning_rate": 0.003, + "loss": 4.017, + "step": 29834 + }, + { + "epoch": 0.29835, + "grad_norm": 0.7875529936579908, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 29835 + }, + { + "epoch": 0.29836, + "grad_norm": 0.9741851736856987, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 29836 + }, + { + "epoch": 0.29837, + "grad_norm": 1.2564165716035691, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 29837 + }, + { + "epoch": 0.29838, + "grad_norm": 0.8070417678265821, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 29838 + }, + { + "epoch": 0.29839, + "grad_norm": 0.804747374111468, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 29839 + }, + { + "epoch": 0.2984, + "grad_norm": 0.8284540291126762, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 29840 + }, + { + "epoch": 0.29841, + "grad_norm": 0.8879353465508759, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 29841 + }, + { + "epoch": 0.29842, + "grad_norm": 0.9637084195746578, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 29842 + }, + { + "epoch": 0.29843, + "grad_norm": 1.1791934203295928, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 29843 + }, + { + "epoch": 0.29844, + "grad_norm": 0.9962279758395227, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 29844 + }, + { + "epoch": 0.29845, + "grad_norm": 0.9919586575413125, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 29845 + }, + { + "epoch": 0.29846, + "grad_norm": 0.8406675494918255, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 29846 + }, + { + "epoch": 0.29847, + "grad_norm": 0.8924895216014683, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 29847 + }, + { + "epoch": 0.29848, + "grad_norm": 0.8365651684846891, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 29848 + }, + { + "epoch": 0.29849, + "grad_norm": 0.7551172971270768, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 29849 + }, + { + "epoch": 0.2985, + "grad_norm": 0.7054970422772004, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 29850 + }, + { + "epoch": 0.29851, + "grad_norm": 0.7740689053334437, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 29851 + }, + { + "epoch": 0.29852, + "grad_norm": 0.8214511195633418, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 29852 + }, + { + "epoch": 0.29853, + "grad_norm": 0.8886683379826029, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 29853 + }, + { + "epoch": 0.29854, + "grad_norm": 0.890682058220869, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 29854 + }, + { + "epoch": 0.29855, + "grad_norm": 0.878638618349842, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 29855 + }, + { + "epoch": 0.29856, + "grad_norm": 0.8556718474217165, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 29856 + }, + { + "epoch": 0.29857, + "grad_norm": 0.7865331634679883, + "learning_rate": 0.003, + "loss": 4.044, + "step": 29857 + }, + { + "epoch": 0.29858, + "grad_norm": 0.6969737706676679, + "learning_rate": 0.003, + "loss": 4.0057, + "step": 29858 + }, + { + "epoch": 0.29859, + "grad_norm": 0.7176812968847659, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 29859 + }, + { + "epoch": 0.2986, + "grad_norm": 0.7675759177330358, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 29860 + }, + { + "epoch": 0.29861, + "grad_norm": 0.8460140861729517, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 29861 + }, + { + "epoch": 0.29862, + "grad_norm": 0.9167122648219999, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 29862 + }, + { + "epoch": 0.29863, + "grad_norm": 0.9762692650823959, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 29863 + }, + { + "epoch": 0.29864, + "grad_norm": 0.9376137328640115, + "learning_rate": 0.003, + "loss": 4.047, + "step": 29864 + }, + { + "epoch": 0.29865, + "grad_norm": 0.8126793268141304, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 29865 + }, + { + "epoch": 0.29866, + "grad_norm": 0.8916374748439551, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 29866 + }, + { + "epoch": 0.29867, + "grad_norm": 1.0591532666196584, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 29867 + }, + { + "epoch": 0.29868, + "grad_norm": 0.8566645971235454, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 29868 + }, + { + "epoch": 0.29869, + "grad_norm": 0.9214805970588473, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 29869 + }, + { + "epoch": 0.2987, + "grad_norm": 0.9768597323913967, + "learning_rate": 0.003, + "loss": 4.049, + "step": 29870 + }, + { + "epoch": 0.29871, + "grad_norm": 1.2148625421734747, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 29871 + }, + { + "epoch": 0.29872, + "grad_norm": 0.8861620808704939, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 29872 + }, + { + "epoch": 0.29873, + "grad_norm": 1.0495765569360282, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 29873 + }, + { + "epoch": 0.29874, + "grad_norm": 1.2499065934797244, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 29874 + }, + { + "epoch": 0.29875, + "grad_norm": 0.8983482008505086, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 29875 + }, + { + "epoch": 0.29876, + "grad_norm": 0.7463817159816309, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 29876 + }, + { + "epoch": 0.29877, + "grad_norm": 0.7265966741137838, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 29877 + }, + { + "epoch": 0.29878, + "grad_norm": 0.7652893105424238, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 29878 + }, + { + "epoch": 0.29879, + "grad_norm": 0.7406606801530248, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 29879 + }, + { + "epoch": 0.2988, + "grad_norm": 0.7154487724549269, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 29880 + }, + { + "epoch": 0.29881, + "grad_norm": 0.6241907660807905, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 29881 + }, + { + "epoch": 0.29882, + "grad_norm": 0.579905717645663, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 29882 + }, + { + "epoch": 0.29883, + "grad_norm": 0.7079610043041767, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 29883 + }, + { + "epoch": 0.29884, + "grad_norm": 0.7990126907752872, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 29884 + }, + { + "epoch": 0.29885, + "grad_norm": 0.7710028849610616, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 29885 + }, + { + "epoch": 0.29886, + "grad_norm": 0.8454477443145967, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 29886 + }, + { + "epoch": 0.29887, + "grad_norm": 0.963456315302519, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 29887 + }, + { + "epoch": 0.29888, + "grad_norm": 1.1026455706808131, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 29888 + }, + { + "epoch": 0.29889, + "grad_norm": 0.9625173267258381, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 29889 + }, + { + "epoch": 0.2989, + "grad_norm": 0.8687887031763422, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 29890 + }, + { + "epoch": 0.29891, + "grad_norm": 0.8327960752553026, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 29891 + }, + { + "epoch": 0.29892, + "grad_norm": 0.858909363576543, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 29892 + }, + { + "epoch": 0.29893, + "grad_norm": 0.9295240799826819, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 29893 + }, + { + "epoch": 0.29894, + "grad_norm": 0.9531787993857671, + "learning_rate": 0.003, + "loss": 4.0072, + "step": 29894 + }, + { + "epoch": 0.29895, + "grad_norm": 0.866574533435851, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 29895 + }, + { + "epoch": 0.29896, + "grad_norm": 0.7907549954130869, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 29896 + }, + { + "epoch": 0.29897, + "grad_norm": 0.7368963353290511, + "learning_rate": 0.003, + "loss": 4.044, + "step": 29897 + }, + { + "epoch": 0.29898, + "grad_norm": 0.7499534276875517, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 29898 + }, + { + "epoch": 0.29899, + "grad_norm": 0.7496316512911848, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 29899 + }, + { + "epoch": 0.299, + "grad_norm": 0.6656815949009053, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 29900 + }, + { + "epoch": 0.29901, + "grad_norm": 0.5736562769253083, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 29901 + }, + { + "epoch": 0.29902, + "grad_norm": 0.6805079792796459, + "learning_rate": 0.003, + "loss": 4.0084, + "step": 29902 + }, + { + "epoch": 0.29903, + "grad_norm": 0.6792499144842693, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 29903 + }, + { + "epoch": 0.29904, + "grad_norm": 0.7372448998185561, + "learning_rate": 0.003, + "loss": 4.026, + "step": 29904 + }, + { + "epoch": 0.29905, + "grad_norm": 0.6928200517307427, + "learning_rate": 0.003, + "loss": 4.043, + "step": 29905 + }, + { + "epoch": 0.29906, + "grad_norm": 0.634728372704992, + "learning_rate": 0.003, + "loss": 4.042, + "step": 29906 + }, + { + "epoch": 0.29907, + "grad_norm": 0.6884889133685475, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 29907 + }, + { + "epoch": 0.29908, + "grad_norm": 1.0646283360060134, + "learning_rate": 0.003, + "loss": 4.029, + "step": 29908 + }, + { + "epoch": 0.29909, + "grad_norm": 1.3348607803611459, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 29909 + }, + { + "epoch": 0.2991, + "grad_norm": 0.8486129291997292, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 29910 + }, + { + "epoch": 0.29911, + "grad_norm": 0.8465249503174112, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 29911 + }, + { + "epoch": 0.29912, + "grad_norm": 0.752963220887893, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 29912 + }, + { + "epoch": 0.29913, + "grad_norm": 0.8543463648778149, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 29913 + }, + { + "epoch": 0.29914, + "grad_norm": 0.9941153775002424, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 29914 + }, + { + "epoch": 0.29915, + "grad_norm": 1.0720961998067813, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 29915 + }, + { + "epoch": 0.29916, + "grad_norm": 0.9556205503100462, + "learning_rate": 0.003, + "loss": 3.9936, + "step": 29916 + }, + { + "epoch": 0.29917, + "grad_norm": 1.0700270465746613, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 29917 + }, + { + "epoch": 0.29918, + "grad_norm": 0.9735479859008586, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 29918 + }, + { + "epoch": 0.29919, + "grad_norm": 1.0367887442185864, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 29919 + }, + { + "epoch": 0.2992, + "grad_norm": 1.0273658836334438, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 29920 + }, + { + "epoch": 0.29921, + "grad_norm": 0.979507571105691, + "learning_rate": 0.003, + "loss": 4.059, + "step": 29921 + }, + { + "epoch": 0.29922, + "grad_norm": 0.9635805317346876, + "learning_rate": 0.003, + "loss": 4.054, + "step": 29922 + }, + { + "epoch": 0.29923, + "grad_norm": 0.9385501536787636, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 29923 + }, + { + "epoch": 0.29924, + "grad_norm": 1.0714777942234008, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 29924 + }, + { + "epoch": 0.29925, + "grad_norm": 0.9782301712148836, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 29925 + }, + { + "epoch": 0.29926, + "grad_norm": 1.0704170947388072, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 29926 + }, + { + "epoch": 0.29927, + "grad_norm": 1.1082807110806554, + "learning_rate": 0.003, + "loss": 4.042, + "step": 29927 + }, + { + "epoch": 0.29928, + "grad_norm": 0.9662183686340001, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 29928 + }, + { + "epoch": 0.29929, + "grad_norm": 0.9828129847523678, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 29929 + }, + { + "epoch": 0.2993, + "grad_norm": 1.0141080316087163, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 29930 + }, + { + "epoch": 0.29931, + "grad_norm": 1.0603941127250183, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 29931 + }, + { + "epoch": 0.29932, + "grad_norm": 0.9385482367956242, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 29932 + }, + { + "epoch": 0.29933, + "grad_norm": 0.9495843704594316, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 29933 + }, + { + "epoch": 0.29934, + "grad_norm": 0.7621890274308901, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 29934 + }, + { + "epoch": 0.29935, + "grad_norm": 0.670148891910428, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 29935 + }, + { + "epoch": 0.29936, + "grad_norm": 0.6918341289448267, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 29936 + }, + { + "epoch": 0.29937, + "grad_norm": 0.7300500019729539, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 29937 + }, + { + "epoch": 0.29938, + "grad_norm": 0.7996003949546515, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 29938 + }, + { + "epoch": 0.29939, + "grad_norm": 0.9009504036121483, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 29939 + }, + { + "epoch": 0.2994, + "grad_norm": 0.8967614383509817, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 29940 + }, + { + "epoch": 0.29941, + "grad_norm": 0.7486581461859741, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 29941 + }, + { + "epoch": 0.29942, + "grad_norm": 0.5436960414793199, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 29942 + }, + { + "epoch": 0.29943, + "grad_norm": 0.5433321346212, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 29943 + }, + { + "epoch": 0.29944, + "grad_norm": 0.6040885457355956, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 29944 + }, + { + "epoch": 0.29945, + "grad_norm": 0.6307588023026995, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 29945 + }, + { + "epoch": 0.29946, + "grad_norm": 0.8505343185913659, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 29946 + }, + { + "epoch": 0.29947, + "grad_norm": 1.0491865569042322, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 29947 + }, + { + "epoch": 0.29948, + "grad_norm": 1.0088624339509338, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 29948 + }, + { + "epoch": 0.29949, + "grad_norm": 0.9793667573466214, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 29949 + }, + { + "epoch": 0.2995, + "grad_norm": 0.9532289106721306, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 29950 + }, + { + "epoch": 0.29951, + "grad_norm": 0.8423690843277375, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 29951 + }, + { + "epoch": 0.29952, + "grad_norm": 0.8127863827334167, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 29952 + }, + { + "epoch": 0.29953, + "grad_norm": 0.7224316627746021, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 29953 + }, + { + "epoch": 0.29954, + "grad_norm": 0.7349568619746814, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 29954 + }, + { + "epoch": 0.29955, + "grad_norm": 0.7351471009496151, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 29955 + }, + { + "epoch": 0.29956, + "grad_norm": 0.9009173971834606, + "learning_rate": 0.003, + "loss": 4.035, + "step": 29956 + }, + { + "epoch": 0.29957, + "grad_norm": 1.0389833900958094, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 29957 + }, + { + "epoch": 0.29958, + "grad_norm": 1.1435333394755678, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 29958 + }, + { + "epoch": 0.29959, + "grad_norm": 0.8518454227039677, + "learning_rate": 0.003, + "loss": 4.044, + "step": 29959 + }, + { + "epoch": 0.2996, + "grad_norm": 0.7615978861368772, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 29960 + }, + { + "epoch": 0.29961, + "grad_norm": 0.7716144330057941, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 29961 + }, + { + "epoch": 0.29962, + "grad_norm": 0.731413426793074, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 29962 + }, + { + "epoch": 0.29963, + "grad_norm": 0.7031853443533272, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 29963 + }, + { + "epoch": 0.29964, + "grad_norm": 0.7450221290428228, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 29964 + }, + { + "epoch": 0.29965, + "grad_norm": 0.8064980273704023, + "learning_rate": 0.003, + "loss": 4.039, + "step": 29965 + }, + { + "epoch": 0.29966, + "grad_norm": 0.7292102000814651, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 29966 + }, + { + "epoch": 0.29967, + "grad_norm": 0.7208088492121769, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 29967 + }, + { + "epoch": 0.29968, + "grad_norm": 0.6357198280191361, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 29968 + }, + { + "epoch": 0.29969, + "grad_norm": 0.5511699932749419, + "learning_rate": 0.003, + "loss": 3.9986, + "step": 29969 + }, + { + "epoch": 0.2997, + "grad_norm": 0.5153403811395589, + "learning_rate": 0.003, + "loss": 4.059, + "step": 29970 + }, + { + "epoch": 0.29971, + "grad_norm": 0.5528168609732497, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 29971 + }, + { + "epoch": 0.29972, + "grad_norm": 0.716017557646997, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 29972 + }, + { + "epoch": 0.29973, + "grad_norm": 1.0034460219563752, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 29973 + }, + { + "epoch": 0.29974, + "grad_norm": 1.4411631734064885, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 29974 + }, + { + "epoch": 0.29975, + "grad_norm": 0.6570003140002783, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 29975 + }, + { + "epoch": 0.29976, + "grad_norm": 0.8045359237880182, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 29976 + }, + { + "epoch": 0.29977, + "grad_norm": 0.8432241108520697, + "learning_rate": 0.003, + "loss": 4.02, + "step": 29977 + }, + { + "epoch": 0.29978, + "grad_norm": 0.8720170248075246, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 29978 + }, + { + "epoch": 0.29979, + "grad_norm": 0.9519789925672366, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 29979 + }, + { + "epoch": 0.2998, + "grad_norm": 0.9440240167604893, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 29980 + }, + { + "epoch": 0.29981, + "grad_norm": 1.0088462072322921, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 29981 + }, + { + "epoch": 0.29982, + "grad_norm": 0.9533193437388763, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 29982 + }, + { + "epoch": 0.29983, + "grad_norm": 0.843125832374633, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 29983 + }, + { + "epoch": 0.29984, + "grad_norm": 0.7335017039971817, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 29984 + }, + { + "epoch": 0.29985, + "grad_norm": 0.708453652750939, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 29985 + }, + { + "epoch": 0.29986, + "grad_norm": 0.697392187517962, + "learning_rate": 0.003, + "loss": 4.047, + "step": 29986 + }, + { + "epoch": 0.29987, + "grad_norm": 0.8052542261238657, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 29987 + }, + { + "epoch": 0.29988, + "grad_norm": 0.8563985358895021, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 29988 + }, + { + "epoch": 0.29989, + "grad_norm": 1.0290446469115981, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 29989 + }, + { + "epoch": 0.2999, + "grad_norm": 1.0788640441323787, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 29990 + }, + { + "epoch": 0.29991, + "grad_norm": 0.7971295929192805, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 29991 + }, + { + "epoch": 0.29992, + "grad_norm": 0.7549997091463418, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 29992 + }, + { + "epoch": 0.29993, + "grad_norm": 0.8524769593365019, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 29993 + }, + { + "epoch": 0.29994, + "grad_norm": 0.8710290558718577, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 29994 + }, + { + "epoch": 0.29995, + "grad_norm": 0.8760789689982463, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 29995 + }, + { + "epoch": 0.29996, + "grad_norm": 0.9096928474588732, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 29996 + }, + { + "epoch": 0.29997, + "grad_norm": 1.013286824238396, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 29997 + }, + { + "epoch": 0.29998, + "grad_norm": 0.9934113186633183, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 29998 + }, + { + "epoch": 0.29999, + "grad_norm": 0.9104835937467876, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 29999 + }, + { + "epoch": 0.3, + "grad_norm": 0.7609670676207376, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 30000 + }, + { + "epoch": 0.30001, + "grad_norm": 0.6768705519191272, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 30001 + }, + { + "epoch": 0.30002, + "grad_norm": 0.7216189318349105, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 30002 + }, + { + "epoch": 0.30003, + "grad_norm": 0.9337982162119006, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 30003 + }, + { + "epoch": 0.30004, + "grad_norm": 1.1581049278023678, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 30004 + }, + { + "epoch": 0.30005, + "grad_norm": 1.1340406382539954, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 30005 + }, + { + "epoch": 0.30006, + "grad_norm": 0.9686000265289199, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 30006 + }, + { + "epoch": 0.30007, + "grad_norm": 0.865684303729371, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 30007 + }, + { + "epoch": 0.30008, + "grad_norm": 0.8801501423520437, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 30008 + }, + { + "epoch": 0.30009, + "grad_norm": 0.8907726531299617, + "learning_rate": 0.003, + "loss": 4.046, + "step": 30009 + }, + { + "epoch": 0.3001, + "grad_norm": 0.9057286780090243, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 30010 + }, + { + "epoch": 0.30011, + "grad_norm": 0.9722334373874111, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 30011 + }, + { + "epoch": 0.30012, + "grad_norm": 1.146644988575812, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 30012 + }, + { + "epoch": 0.30013, + "grad_norm": 1.0382624693738378, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 30013 + }, + { + "epoch": 0.30014, + "grad_norm": 0.9624670762005234, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 30014 + }, + { + "epoch": 0.30015, + "grad_norm": 0.9467457493940971, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 30015 + }, + { + "epoch": 0.30016, + "grad_norm": 0.9637719141145777, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 30016 + }, + { + "epoch": 0.30017, + "grad_norm": 0.8037783126999698, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 30017 + }, + { + "epoch": 0.30018, + "grad_norm": 0.6689509918660139, + "learning_rate": 0.003, + "loss": 4.059, + "step": 30018 + }, + { + "epoch": 0.30019, + "grad_norm": 0.6859254473787996, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 30019 + }, + { + "epoch": 0.3002, + "grad_norm": 0.6623547301896632, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 30020 + }, + { + "epoch": 0.30021, + "grad_norm": 0.7579636175985185, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 30021 + }, + { + "epoch": 0.30022, + "grad_norm": 0.9090252729828222, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 30022 + }, + { + "epoch": 0.30023, + "grad_norm": 0.8618629308045245, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 30023 + }, + { + "epoch": 0.30024, + "grad_norm": 0.7750880138993158, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 30024 + }, + { + "epoch": 0.30025, + "grad_norm": 0.7043935545754563, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 30025 + }, + { + "epoch": 0.30026, + "grad_norm": 0.5462093703042328, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 30026 + }, + { + "epoch": 0.30027, + "grad_norm": 0.579464940457767, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 30027 + }, + { + "epoch": 0.30028, + "grad_norm": 0.5619803956517676, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 30028 + }, + { + "epoch": 0.30029, + "grad_norm": 0.683528488963088, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 30029 + }, + { + "epoch": 0.3003, + "grad_norm": 0.9005339668751994, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 30030 + }, + { + "epoch": 0.30031, + "grad_norm": 1.043034063549366, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 30031 + }, + { + "epoch": 0.30032, + "grad_norm": 1.1044415073978013, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 30032 + }, + { + "epoch": 0.30033, + "grad_norm": 0.8515047874466991, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 30033 + }, + { + "epoch": 0.30034, + "grad_norm": 0.8079167024001153, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 30034 + }, + { + "epoch": 0.30035, + "grad_norm": 0.8223760540839545, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 30035 + }, + { + "epoch": 0.30036, + "grad_norm": 0.8243023067115568, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 30036 + }, + { + "epoch": 0.30037, + "grad_norm": 0.9364052224680736, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 30037 + }, + { + "epoch": 0.30038, + "grad_norm": 0.9859726026801926, + "learning_rate": 0.003, + "loss": 4.063, + "step": 30038 + }, + { + "epoch": 0.30039, + "grad_norm": 0.8828738888731624, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 30039 + }, + { + "epoch": 0.3004, + "grad_norm": 0.9749440159725465, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 30040 + }, + { + "epoch": 0.30041, + "grad_norm": 1.0180256552220674, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 30041 + }, + { + "epoch": 0.30042, + "grad_norm": 1.052450809911558, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 30042 + }, + { + "epoch": 0.30043, + "grad_norm": 1.0022914870255615, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 30043 + }, + { + "epoch": 0.30044, + "grad_norm": 1.0583587673789436, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 30044 + }, + { + "epoch": 0.30045, + "grad_norm": 0.8797160954048673, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 30045 + }, + { + "epoch": 0.30046, + "grad_norm": 0.9060523224018905, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 30046 + }, + { + "epoch": 0.30047, + "grad_norm": 0.921091882592109, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 30047 + }, + { + "epoch": 0.30048, + "grad_norm": 0.8249637446581346, + "learning_rate": 0.003, + "loss": 4.075, + "step": 30048 + }, + { + "epoch": 0.30049, + "grad_norm": 0.6975417003859311, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 30049 + }, + { + "epoch": 0.3005, + "grad_norm": 0.7535937565190751, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 30050 + }, + { + "epoch": 0.30051, + "grad_norm": 0.8743596691093144, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 30051 + }, + { + "epoch": 0.30052, + "grad_norm": 0.9364444064689101, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 30052 + }, + { + "epoch": 0.30053, + "grad_norm": 0.9282894316145495, + "learning_rate": 0.003, + "loss": 4.047, + "step": 30053 + }, + { + "epoch": 0.30054, + "grad_norm": 0.9510638857677045, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 30054 + }, + { + "epoch": 0.30055, + "grad_norm": 0.8523663704086185, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 30055 + }, + { + "epoch": 0.30056, + "grad_norm": 0.7916935711883694, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 30056 + }, + { + "epoch": 0.30057, + "grad_norm": 0.8037230615048561, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 30057 + }, + { + "epoch": 0.30058, + "grad_norm": 0.8615380437480775, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 30058 + }, + { + "epoch": 0.30059, + "grad_norm": 0.9911226831859725, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 30059 + }, + { + "epoch": 0.3006, + "grad_norm": 0.9722316069959992, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 30060 + }, + { + "epoch": 0.30061, + "grad_norm": 1.0000912636679056, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 30061 + }, + { + "epoch": 0.30062, + "grad_norm": 0.9951263532939444, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 30062 + }, + { + "epoch": 0.30063, + "grad_norm": 0.9800900643328759, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 30063 + }, + { + "epoch": 0.30064, + "grad_norm": 0.918794953737637, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 30064 + }, + { + "epoch": 0.30065, + "grad_norm": 0.7944156457114094, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 30065 + }, + { + "epoch": 0.30066, + "grad_norm": 0.7729617448687248, + "learning_rate": 0.003, + "loss": 4.078, + "step": 30066 + }, + { + "epoch": 0.30067, + "grad_norm": 0.7213423191874977, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 30067 + }, + { + "epoch": 0.30068, + "grad_norm": 0.6726827151221186, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 30068 + }, + { + "epoch": 0.30069, + "grad_norm": 0.6141110949903028, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 30069 + }, + { + "epoch": 0.3007, + "grad_norm": 0.7229540208822735, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 30070 + }, + { + "epoch": 0.30071, + "grad_norm": 0.8984671497992637, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 30071 + }, + { + "epoch": 0.30072, + "grad_norm": 1.1410116392740988, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 30072 + }, + { + "epoch": 0.30073, + "grad_norm": 0.8385558605503886, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 30073 + }, + { + "epoch": 0.30074, + "grad_norm": 0.7145560590076628, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 30074 + }, + { + "epoch": 0.30075, + "grad_norm": 0.6364971444796947, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 30075 + }, + { + "epoch": 0.30076, + "grad_norm": 0.6066845628687378, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 30076 + }, + { + "epoch": 0.30077, + "grad_norm": 0.5408152101135294, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 30077 + }, + { + "epoch": 0.30078, + "grad_norm": 0.5071662683503553, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 30078 + }, + { + "epoch": 0.30079, + "grad_norm": 0.5615718819642903, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 30079 + }, + { + "epoch": 0.3008, + "grad_norm": 0.6540390855159591, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 30080 + }, + { + "epoch": 0.30081, + "grad_norm": 0.8398230736417415, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 30081 + }, + { + "epoch": 0.30082, + "grad_norm": 1.0588878078917954, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 30082 + }, + { + "epoch": 0.30083, + "grad_norm": 1.0024313744365674, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 30083 + }, + { + "epoch": 0.30084, + "grad_norm": 0.864016386433271, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 30084 + }, + { + "epoch": 0.30085, + "grad_norm": 0.7840385847514313, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 30085 + }, + { + "epoch": 0.30086, + "grad_norm": 0.8237240552770042, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 30086 + }, + { + "epoch": 0.30087, + "grad_norm": 0.79118066110452, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 30087 + }, + { + "epoch": 0.30088, + "grad_norm": 0.7528140503604116, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 30088 + }, + { + "epoch": 0.30089, + "grad_norm": 0.8292117519622972, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 30089 + }, + { + "epoch": 0.3009, + "grad_norm": 0.8863435333810046, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 30090 + }, + { + "epoch": 0.30091, + "grad_norm": 0.9316574911148339, + "learning_rate": 0.003, + "loss": 4.065, + "step": 30091 + }, + { + "epoch": 0.30092, + "grad_norm": 0.913764352276469, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 30092 + }, + { + "epoch": 0.30093, + "grad_norm": 0.8692387064950584, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 30093 + }, + { + "epoch": 0.30094, + "grad_norm": 0.970055132569459, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 30094 + }, + { + "epoch": 0.30095, + "grad_norm": 1.0863418822648088, + "learning_rate": 0.003, + "loss": 4.034, + "step": 30095 + }, + { + "epoch": 0.30096, + "grad_norm": 1.0232357345895409, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 30096 + }, + { + "epoch": 0.30097, + "grad_norm": 0.8713014856312566, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 30097 + }, + { + "epoch": 0.30098, + "grad_norm": 0.8981817705233588, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 30098 + }, + { + "epoch": 0.30099, + "grad_norm": 1.0820389679329667, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 30099 + }, + { + "epoch": 0.301, + "grad_norm": 1.0762350415117965, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 30100 + }, + { + "epoch": 0.30101, + "grad_norm": 0.9973509471455523, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 30101 + }, + { + "epoch": 0.30102, + "grad_norm": 1.1506205516157868, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 30102 + }, + { + "epoch": 0.30103, + "grad_norm": 0.9283518604922325, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 30103 + }, + { + "epoch": 0.30104, + "grad_norm": 0.9400148061894855, + "learning_rate": 0.003, + "loss": 4.052, + "step": 30104 + }, + { + "epoch": 0.30105, + "grad_norm": 1.0016286824167613, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 30105 + }, + { + "epoch": 0.30106, + "grad_norm": 1.1977642133734585, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 30106 + }, + { + "epoch": 0.30107, + "grad_norm": 0.8362791462005004, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 30107 + }, + { + "epoch": 0.30108, + "grad_norm": 0.7907209557495114, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 30108 + }, + { + "epoch": 0.30109, + "grad_norm": 0.6190337375909825, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 30109 + }, + { + "epoch": 0.3011, + "grad_norm": 0.6450194041014916, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 30110 + }, + { + "epoch": 0.30111, + "grad_norm": 0.6587941590689612, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 30111 + }, + { + "epoch": 0.30112, + "grad_norm": 0.7835331809286532, + "learning_rate": 0.003, + "loss": 4.034, + "step": 30112 + }, + { + "epoch": 0.30113, + "grad_norm": 0.8973886184932217, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 30113 + }, + { + "epoch": 0.30114, + "grad_norm": 0.9794868722981024, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 30114 + }, + { + "epoch": 0.30115, + "grad_norm": 0.9573664396017678, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 30115 + }, + { + "epoch": 0.30116, + "grad_norm": 0.8013979782937286, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 30116 + }, + { + "epoch": 0.30117, + "grad_norm": 0.746306562452, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 30117 + }, + { + "epoch": 0.30118, + "grad_norm": 0.7345993729588842, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 30118 + }, + { + "epoch": 0.30119, + "grad_norm": 0.6604456319685357, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 30119 + }, + { + "epoch": 0.3012, + "grad_norm": 0.6078471200141904, + "learning_rate": 0.003, + "loss": 4.064, + "step": 30120 + }, + { + "epoch": 0.30121, + "grad_norm": 0.6048428354305295, + "learning_rate": 0.003, + "loss": 4.038, + "step": 30121 + }, + { + "epoch": 0.30122, + "grad_norm": 0.5751053806824694, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 30122 + }, + { + "epoch": 0.30123, + "grad_norm": 0.5825817751715745, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 30123 + }, + { + "epoch": 0.30124, + "grad_norm": 0.6554904891482146, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 30124 + }, + { + "epoch": 0.30125, + "grad_norm": 0.850841628446038, + "learning_rate": 0.003, + "loss": 4.029, + "step": 30125 + }, + { + "epoch": 0.30126, + "grad_norm": 1.1698886941979998, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 30126 + }, + { + "epoch": 0.30127, + "grad_norm": 0.806427010350725, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 30127 + }, + { + "epoch": 0.30128, + "grad_norm": 0.6618090482210965, + "learning_rate": 0.003, + "loss": 4.051, + "step": 30128 + }, + { + "epoch": 0.30129, + "grad_norm": 0.6811450009370348, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 30129 + }, + { + "epoch": 0.3013, + "grad_norm": 0.7127479194248231, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 30130 + }, + { + "epoch": 0.30131, + "grad_norm": 0.7401118890625202, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 30131 + }, + { + "epoch": 0.30132, + "grad_norm": 0.7366464220984607, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 30132 + }, + { + "epoch": 0.30133, + "grad_norm": 0.8846841795565, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 30133 + }, + { + "epoch": 0.30134, + "grad_norm": 1.207185552503816, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 30134 + }, + { + "epoch": 0.30135, + "grad_norm": 0.7622856521279255, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 30135 + }, + { + "epoch": 0.30136, + "grad_norm": 0.5716734051600992, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 30136 + }, + { + "epoch": 0.30137, + "grad_norm": 0.729130944485432, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 30137 + }, + { + "epoch": 0.30138, + "grad_norm": 0.8494605358873717, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 30138 + }, + { + "epoch": 0.30139, + "grad_norm": 1.0416513872681878, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 30139 + }, + { + "epoch": 0.3014, + "grad_norm": 1.0420772417366526, + "learning_rate": 0.003, + "loss": 4.0015, + "step": 30140 + }, + { + "epoch": 0.30141, + "grad_norm": 1.0008164486738604, + "learning_rate": 0.003, + "loss": 4.025, + "step": 30141 + }, + { + "epoch": 0.30142, + "grad_norm": 0.8860170151281823, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 30142 + }, + { + "epoch": 0.30143, + "grad_norm": 0.8499583971903932, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 30143 + }, + { + "epoch": 0.30144, + "grad_norm": 0.8726028574911808, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 30144 + }, + { + "epoch": 0.30145, + "grad_norm": 0.9528791970936563, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 30145 + }, + { + "epoch": 0.30146, + "grad_norm": 0.9923317792665013, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 30146 + }, + { + "epoch": 0.30147, + "grad_norm": 1.1399615488088233, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 30147 + }, + { + "epoch": 0.30148, + "grad_norm": 1.015608599080371, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 30148 + }, + { + "epoch": 0.30149, + "grad_norm": 1.1761185165508, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 30149 + }, + { + "epoch": 0.3015, + "grad_norm": 1.095759960703571, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 30150 + }, + { + "epoch": 0.30151, + "grad_norm": 0.7492310058232573, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 30151 + }, + { + "epoch": 0.30152, + "grad_norm": 0.7508516214228315, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 30152 + }, + { + "epoch": 0.30153, + "grad_norm": 0.73165040518065, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 30153 + }, + { + "epoch": 0.30154, + "grad_norm": 0.8101928026200462, + "learning_rate": 0.003, + "loss": 4.041, + "step": 30154 + }, + { + "epoch": 0.30155, + "grad_norm": 0.9078439647495635, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 30155 + }, + { + "epoch": 0.30156, + "grad_norm": 1.0113830363444647, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 30156 + }, + { + "epoch": 0.30157, + "grad_norm": 1.1460778397011255, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 30157 + }, + { + "epoch": 0.30158, + "grad_norm": 0.9390622507499444, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 30158 + }, + { + "epoch": 0.30159, + "grad_norm": 0.8884700711819042, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 30159 + }, + { + "epoch": 0.3016, + "grad_norm": 0.8129163416528855, + "learning_rate": 0.003, + "loss": 4.071, + "step": 30160 + }, + { + "epoch": 0.30161, + "grad_norm": 0.7179065489472536, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 30161 + }, + { + "epoch": 0.30162, + "grad_norm": 0.6229724760344895, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 30162 + }, + { + "epoch": 0.30163, + "grad_norm": 0.5872493987647046, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 30163 + }, + { + "epoch": 0.30164, + "grad_norm": 0.5262716817864452, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 30164 + }, + { + "epoch": 0.30165, + "grad_norm": 0.48924516695168063, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 30165 + }, + { + "epoch": 0.30166, + "grad_norm": 0.5219311979934258, + "learning_rate": 0.003, + "loss": 4.0, + "step": 30166 + }, + { + "epoch": 0.30167, + "grad_norm": 0.617220437680268, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 30167 + }, + { + "epoch": 0.30168, + "grad_norm": 0.6731686921783068, + "learning_rate": 0.003, + "loss": 4.043, + "step": 30168 + }, + { + "epoch": 0.30169, + "grad_norm": 0.8028200768431429, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 30169 + }, + { + "epoch": 0.3017, + "grad_norm": 0.9927710316218432, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 30170 + }, + { + "epoch": 0.30171, + "grad_norm": 1.230435455188699, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 30171 + }, + { + "epoch": 0.30172, + "grad_norm": 0.6468184113836593, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 30172 + }, + { + "epoch": 0.30173, + "grad_norm": 0.6914570176515393, + "learning_rate": 0.003, + "loss": 4.033, + "step": 30173 + }, + { + "epoch": 0.30174, + "grad_norm": 0.7440930304004294, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 30174 + }, + { + "epoch": 0.30175, + "grad_norm": 0.7684478765390758, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 30175 + }, + { + "epoch": 0.30176, + "grad_norm": 0.8473078011691901, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 30176 + }, + { + "epoch": 0.30177, + "grad_norm": 0.9998850605661346, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 30177 + }, + { + "epoch": 0.30178, + "grad_norm": 1.191118895961893, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 30178 + }, + { + "epoch": 0.30179, + "grad_norm": 0.9319372224088357, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 30179 + }, + { + "epoch": 0.3018, + "grad_norm": 0.9241402466680498, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 30180 + }, + { + "epoch": 0.30181, + "grad_norm": 1.0082617174558068, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 30181 + }, + { + "epoch": 0.30182, + "grad_norm": 0.9268322466053224, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 30182 + }, + { + "epoch": 0.30183, + "grad_norm": 0.8054305846010326, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 30183 + }, + { + "epoch": 0.30184, + "grad_norm": 0.8776615775735316, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 30184 + }, + { + "epoch": 0.30185, + "grad_norm": 0.8567159995745438, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 30185 + }, + { + "epoch": 0.30186, + "grad_norm": 0.7864827193211007, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 30186 + }, + { + "epoch": 0.30187, + "grad_norm": 0.7074574022526259, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 30187 + }, + { + "epoch": 0.30188, + "grad_norm": 0.8796303585088163, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 30188 + }, + { + "epoch": 0.30189, + "grad_norm": 1.084594708203222, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 30189 + }, + { + "epoch": 0.3019, + "grad_norm": 1.0803267897508766, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 30190 + }, + { + "epoch": 0.30191, + "grad_norm": 1.0133599223194938, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 30191 + }, + { + "epoch": 0.30192, + "grad_norm": 0.9459721852381003, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 30192 + }, + { + "epoch": 0.30193, + "grad_norm": 1.0030039629693577, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 30193 + }, + { + "epoch": 0.30194, + "grad_norm": 0.9647096275252048, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 30194 + }, + { + "epoch": 0.30195, + "grad_norm": 0.9134975042400836, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 30195 + }, + { + "epoch": 0.30196, + "grad_norm": 0.8296765586270167, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 30196 + }, + { + "epoch": 0.30197, + "grad_norm": 0.7268598319664543, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 30197 + }, + { + "epoch": 0.30198, + "grad_norm": 0.7576457102845, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 30198 + }, + { + "epoch": 0.30199, + "grad_norm": 0.7236578858880327, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 30199 + }, + { + "epoch": 0.302, + "grad_norm": 0.7807662772913275, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 30200 + }, + { + "epoch": 0.30201, + "grad_norm": 0.8157959707903321, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 30201 + }, + { + "epoch": 0.30202, + "grad_norm": 0.8565347842947642, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 30202 + }, + { + "epoch": 0.30203, + "grad_norm": 0.8254922441594126, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 30203 + }, + { + "epoch": 0.30204, + "grad_norm": 1.0395414295896102, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 30204 + }, + { + "epoch": 0.30205, + "grad_norm": 1.1346354252092652, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 30205 + }, + { + "epoch": 0.30206, + "grad_norm": 1.0048613163711353, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 30206 + }, + { + "epoch": 0.30207, + "grad_norm": 0.8947779257328616, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 30207 + }, + { + "epoch": 0.30208, + "grad_norm": 0.7789926533717361, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 30208 + }, + { + "epoch": 0.30209, + "grad_norm": 0.6294075332404504, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 30209 + }, + { + "epoch": 0.3021, + "grad_norm": 0.5794887657185035, + "learning_rate": 0.003, + "loss": 4.039, + "step": 30210 + }, + { + "epoch": 0.30211, + "grad_norm": 0.6878575425938104, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 30211 + }, + { + "epoch": 0.30212, + "grad_norm": 0.7904747368103345, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 30212 + }, + { + "epoch": 0.30213, + "grad_norm": 0.8423104315246172, + "learning_rate": 0.003, + "loss": 4.018, + "step": 30213 + }, + { + "epoch": 0.30214, + "grad_norm": 0.9181942290641042, + "learning_rate": 0.003, + "loss": 4.045, + "step": 30214 + }, + { + "epoch": 0.30215, + "grad_norm": 0.9280790786166967, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 30215 + }, + { + "epoch": 0.30216, + "grad_norm": 0.8861612461425382, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 30216 + }, + { + "epoch": 0.30217, + "grad_norm": 0.9560385597711841, + "learning_rate": 0.003, + "loss": 4.026, + "step": 30217 + }, + { + "epoch": 0.30218, + "grad_norm": 0.9675741604916318, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 30218 + }, + { + "epoch": 0.30219, + "grad_norm": 0.829072723069058, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 30219 + }, + { + "epoch": 0.3022, + "grad_norm": 0.6917333307009174, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 30220 + }, + { + "epoch": 0.30221, + "grad_norm": 0.801743159567868, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 30221 + }, + { + "epoch": 0.30222, + "grad_norm": 0.9904544803647845, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 30222 + }, + { + "epoch": 0.30223, + "grad_norm": 1.2567400902459158, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 30223 + }, + { + "epoch": 0.30224, + "grad_norm": 0.6580466809910713, + "learning_rate": 0.003, + "loss": 4.023, + "step": 30224 + }, + { + "epoch": 0.30225, + "grad_norm": 0.6552494141731057, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 30225 + }, + { + "epoch": 0.30226, + "grad_norm": 0.7268933074144787, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 30226 + }, + { + "epoch": 0.30227, + "grad_norm": 0.7533267192358415, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 30227 + }, + { + "epoch": 0.30228, + "grad_norm": 0.8663240625366962, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 30228 + }, + { + "epoch": 0.30229, + "grad_norm": 0.9616932705896533, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 30229 + }, + { + "epoch": 0.3023, + "grad_norm": 0.9851997579955184, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 30230 + }, + { + "epoch": 0.30231, + "grad_norm": 0.8770988413238499, + "learning_rate": 0.003, + "loss": 3.9938, + "step": 30231 + }, + { + "epoch": 0.30232, + "grad_norm": 0.777840995872639, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 30232 + }, + { + "epoch": 0.30233, + "grad_norm": 0.7853122651488179, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 30233 + }, + { + "epoch": 0.30234, + "grad_norm": 0.8677222722718249, + "learning_rate": 0.003, + "loss": 4.061, + "step": 30234 + }, + { + "epoch": 0.30235, + "grad_norm": 0.9088437875486106, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 30235 + }, + { + "epoch": 0.30236, + "grad_norm": 0.9795015108727622, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 30236 + }, + { + "epoch": 0.30237, + "grad_norm": 1.0431021719652733, + "learning_rate": 0.003, + "loss": 4.07, + "step": 30237 + }, + { + "epoch": 0.30238, + "grad_norm": 0.9567088999423756, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 30238 + }, + { + "epoch": 0.30239, + "grad_norm": 0.8719577168680183, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 30239 + }, + { + "epoch": 0.3024, + "grad_norm": 0.8847054409272194, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 30240 + }, + { + "epoch": 0.30241, + "grad_norm": 1.0324779281681393, + "learning_rate": 0.003, + "loss": 4.057, + "step": 30241 + }, + { + "epoch": 0.30242, + "grad_norm": 1.1174717366240166, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 30242 + }, + { + "epoch": 0.30243, + "grad_norm": 0.8548772551325193, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 30243 + }, + { + "epoch": 0.30244, + "grad_norm": 0.8716835927874, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 30244 + }, + { + "epoch": 0.30245, + "grad_norm": 0.8870638937515534, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 30245 + }, + { + "epoch": 0.30246, + "grad_norm": 0.9281777253711491, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 30246 + }, + { + "epoch": 0.30247, + "grad_norm": 1.1480474558640383, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 30247 + }, + { + "epoch": 0.30248, + "grad_norm": 0.8556284493919245, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 30248 + }, + { + "epoch": 0.30249, + "grad_norm": 0.7762358545881688, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 30249 + }, + { + "epoch": 0.3025, + "grad_norm": 0.7889072140931954, + "learning_rate": 0.003, + "loss": 4.031, + "step": 30250 + }, + { + "epoch": 0.30251, + "grad_norm": 0.6634415407956741, + "learning_rate": 0.003, + "loss": 4.022, + "step": 30251 + }, + { + "epoch": 0.30252, + "grad_norm": 0.6716827740077547, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 30252 + }, + { + "epoch": 0.30253, + "grad_norm": 0.6564279248289177, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 30253 + }, + { + "epoch": 0.30254, + "grad_norm": 0.6843073766519795, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 30254 + }, + { + "epoch": 0.30255, + "grad_norm": 0.9708243470474383, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 30255 + }, + { + "epoch": 0.30256, + "grad_norm": 1.1856892877141294, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 30256 + }, + { + "epoch": 0.30257, + "grad_norm": 0.7047268858673291, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 30257 + }, + { + "epoch": 0.30258, + "grad_norm": 0.6217357136422378, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 30258 + }, + { + "epoch": 0.30259, + "grad_norm": 0.6661141812941753, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 30259 + }, + { + "epoch": 0.3026, + "grad_norm": 0.8288771426003788, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 30260 + }, + { + "epoch": 0.30261, + "grad_norm": 0.9389334705134942, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 30261 + }, + { + "epoch": 0.30262, + "grad_norm": 0.8923514026387495, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 30262 + }, + { + "epoch": 0.30263, + "grad_norm": 0.8378552060658956, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 30263 + }, + { + "epoch": 0.30264, + "grad_norm": 0.8286896357732173, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 30264 + }, + { + "epoch": 0.30265, + "grad_norm": 0.7736714578059609, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 30265 + }, + { + "epoch": 0.30266, + "grad_norm": 0.7027885308545315, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 30266 + }, + { + "epoch": 0.30267, + "grad_norm": 0.7572522819183456, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 30267 + }, + { + "epoch": 0.30268, + "grad_norm": 0.8583655452138667, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 30268 + }, + { + "epoch": 0.30269, + "grad_norm": 0.9257866844793068, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 30269 + }, + { + "epoch": 0.3027, + "grad_norm": 0.9587327236254054, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 30270 + }, + { + "epoch": 0.30271, + "grad_norm": 0.9973885176791025, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 30271 + }, + { + "epoch": 0.30272, + "grad_norm": 1.1154447798842144, + "learning_rate": 0.003, + "loss": 4.095, + "step": 30272 + }, + { + "epoch": 0.30273, + "grad_norm": 0.8938248111648088, + "learning_rate": 0.003, + "loss": 4.058, + "step": 30273 + }, + { + "epoch": 0.30274, + "grad_norm": 0.7974154927094851, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 30274 + }, + { + "epoch": 0.30275, + "grad_norm": 0.7646063148274, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 30275 + }, + { + "epoch": 0.30276, + "grad_norm": 0.8578301505446296, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 30276 + }, + { + "epoch": 0.30277, + "grad_norm": 0.965487826416326, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 30277 + }, + { + "epoch": 0.30278, + "grad_norm": 0.9761680390046865, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 30278 + }, + { + "epoch": 0.30279, + "grad_norm": 1.015388266586735, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 30279 + }, + { + "epoch": 0.3028, + "grad_norm": 1.0182864075842986, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 30280 + }, + { + "epoch": 0.30281, + "grad_norm": 0.9425172693851305, + "learning_rate": 0.003, + "loss": 4.0006, + "step": 30281 + }, + { + "epoch": 0.30282, + "grad_norm": 0.8810106789198481, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 30282 + }, + { + "epoch": 0.30283, + "grad_norm": 0.8125669325110937, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 30283 + }, + { + "epoch": 0.30284, + "grad_norm": 0.7198831490635238, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 30284 + }, + { + "epoch": 0.30285, + "grad_norm": 0.7296678374144023, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 30285 + }, + { + "epoch": 0.30286, + "grad_norm": 0.751971281174092, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 30286 + }, + { + "epoch": 0.30287, + "grad_norm": 0.7541534009811488, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 30287 + }, + { + "epoch": 0.30288, + "grad_norm": 0.7441026158887752, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 30288 + }, + { + "epoch": 0.30289, + "grad_norm": 0.7320371537104422, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 30289 + }, + { + "epoch": 0.3029, + "grad_norm": 0.8192699362569991, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 30290 + }, + { + "epoch": 0.30291, + "grad_norm": 0.7396393554226357, + "learning_rate": 0.003, + "loss": 4.05, + "step": 30291 + }, + { + "epoch": 0.30292, + "grad_norm": 0.7322871753341583, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 30292 + }, + { + "epoch": 0.30293, + "grad_norm": 0.7552806767362475, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 30293 + }, + { + "epoch": 0.30294, + "grad_norm": 0.8307313136559434, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 30294 + }, + { + "epoch": 0.30295, + "grad_norm": 0.8115439524067782, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 30295 + }, + { + "epoch": 0.30296, + "grad_norm": 0.8244297448028151, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 30296 + }, + { + "epoch": 0.30297, + "grad_norm": 0.7328666570988653, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 30297 + }, + { + "epoch": 0.30298, + "grad_norm": 0.6955333035293306, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 30298 + }, + { + "epoch": 0.30299, + "grad_norm": 0.6730751202375586, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 30299 + }, + { + "epoch": 0.303, + "grad_norm": 0.7484661433762339, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 30300 + }, + { + "epoch": 0.30301, + "grad_norm": 0.8594429525780563, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 30301 + }, + { + "epoch": 0.30302, + "grad_norm": 0.9226836181777879, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 30302 + }, + { + "epoch": 0.30303, + "grad_norm": 1.0218726866250025, + "learning_rate": 0.003, + "loss": 4.096, + "step": 30303 + }, + { + "epoch": 0.30304, + "grad_norm": 1.0513759282274067, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 30304 + }, + { + "epoch": 0.30305, + "grad_norm": 0.8383673264780788, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 30305 + }, + { + "epoch": 0.30306, + "grad_norm": 0.8479729082374823, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 30306 + }, + { + "epoch": 0.30307, + "grad_norm": 0.8705096203591386, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 30307 + }, + { + "epoch": 0.30308, + "grad_norm": 1.0320537142735033, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 30308 + }, + { + "epoch": 0.30309, + "grad_norm": 1.1949871638826641, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 30309 + }, + { + "epoch": 0.3031, + "grad_norm": 0.9663187034316818, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 30310 + }, + { + "epoch": 0.30311, + "grad_norm": 0.9761822857109628, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 30311 + }, + { + "epoch": 0.30312, + "grad_norm": 0.9837425009489806, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 30312 + }, + { + "epoch": 0.30313, + "grad_norm": 0.9024120654314449, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 30313 + }, + { + "epoch": 0.30314, + "grad_norm": 0.8674094187069319, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 30314 + }, + { + "epoch": 0.30315, + "grad_norm": 0.9041556031086786, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 30315 + }, + { + "epoch": 0.30316, + "grad_norm": 0.9394311657478577, + "learning_rate": 0.003, + "loss": 4.039, + "step": 30316 + }, + { + "epoch": 0.30317, + "grad_norm": 0.9662805251903689, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 30317 + }, + { + "epoch": 0.30318, + "grad_norm": 1.0070809227896294, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 30318 + }, + { + "epoch": 0.30319, + "grad_norm": 0.9520665896256096, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 30319 + }, + { + "epoch": 0.3032, + "grad_norm": 0.8791330806702093, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 30320 + }, + { + "epoch": 0.30321, + "grad_norm": 0.8275199618899908, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 30321 + }, + { + "epoch": 0.30322, + "grad_norm": 0.9259135899100793, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 30322 + }, + { + "epoch": 0.30323, + "grad_norm": 0.8930378482409589, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 30323 + }, + { + "epoch": 0.30324, + "grad_norm": 0.9640065182701385, + "learning_rate": 0.003, + "loss": 4.053, + "step": 30324 + }, + { + "epoch": 0.30325, + "grad_norm": 0.9355570709520026, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 30325 + }, + { + "epoch": 0.30326, + "grad_norm": 0.9451752486195746, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 30326 + }, + { + "epoch": 0.30327, + "grad_norm": 0.8444279845649119, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 30327 + }, + { + "epoch": 0.30328, + "grad_norm": 0.9638581131720523, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 30328 + }, + { + "epoch": 0.30329, + "grad_norm": 1.2164357065826361, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 30329 + }, + { + "epoch": 0.3033, + "grad_norm": 0.8500487588012542, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 30330 + }, + { + "epoch": 0.30331, + "grad_norm": 0.6809214158237236, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 30331 + }, + { + "epoch": 0.30332, + "grad_norm": 0.6622067608266444, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 30332 + }, + { + "epoch": 0.30333, + "grad_norm": 0.5439625163689614, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 30333 + }, + { + "epoch": 0.30334, + "grad_norm": 0.5104067080174235, + "learning_rate": 0.003, + "loss": 4.042, + "step": 30334 + }, + { + "epoch": 0.30335, + "grad_norm": 0.536076715743425, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 30335 + }, + { + "epoch": 0.30336, + "grad_norm": 0.5633306296787329, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 30336 + }, + { + "epoch": 0.30337, + "grad_norm": 0.603095346957842, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 30337 + }, + { + "epoch": 0.30338, + "grad_norm": 0.6593516123939079, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 30338 + }, + { + "epoch": 0.30339, + "grad_norm": 0.6999218461770925, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 30339 + }, + { + "epoch": 0.3034, + "grad_norm": 0.7055835145734309, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 30340 + }, + { + "epoch": 0.30341, + "grad_norm": 0.7743380441808827, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 30341 + }, + { + "epoch": 0.30342, + "grad_norm": 0.8022722102790272, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 30342 + }, + { + "epoch": 0.30343, + "grad_norm": 0.8427322854885158, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 30343 + }, + { + "epoch": 0.30344, + "grad_norm": 1.017967577262336, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 30344 + }, + { + "epoch": 0.30345, + "grad_norm": 1.2565658641051938, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 30345 + }, + { + "epoch": 0.30346, + "grad_norm": 0.8434448705219795, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 30346 + }, + { + "epoch": 0.30347, + "grad_norm": 0.7778339357991646, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 30347 + }, + { + "epoch": 0.30348, + "grad_norm": 0.7547255526072627, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 30348 + }, + { + "epoch": 0.30349, + "grad_norm": 0.6303580063583124, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 30349 + }, + { + "epoch": 0.3035, + "grad_norm": 0.6978509716572284, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 30350 + }, + { + "epoch": 0.30351, + "grad_norm": 0.7036367152057607, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 30351 + }, + { + "epoch": 0.30352, + "grad_norm": 0.8213754893843945, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 30352 + }, + { + "epoch": 0.30353, + "grad_norm": 0.8639831488103732, + "learning_rate": 0.003, + "loss": 4.0029, + "step": 30353 + }, + { + "epoch": 0.30354, + "grad_norm": 0.9723515904276983, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 30354 + }, + { + "epoch": 0.30355, + "grad_norm": 1.1624190832534276, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 30355 + }, + { + "epoch": 0.30356, + "grad_norm": 0.9977567528210546, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 30356 + }, + { + "epoch": 0.30357, + "grad_norm": 0.9478570711543748, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 30357 + }, + { + "epoch": 0.30358, + "grad_norm": 0.9258046695519578, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 30358 + }, + { + "epoch": 0.30359, + "grad_norm": 0.8180790303742542, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 30359 + }, + { + "epoch": 0.3036, + "grad_norm": 0.8786888998024635, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 30360 + }, + { + "epoch": 0.30361, + "grad_norm": 0.8983794293591455, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 30361 + }, + { + "epoch": 0.30362, + "grad_norm": 0.812332159574046, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 30362 + }, + { + "epoch": 0.30363, + "grad_norm": 0.6745126366424232, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 30363 + }, + { + "epoch": 0.30364, + "grad_norm": 0.6277426253225633, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 30364 + }, + { + "epoch": 0.30365, + "grad_norm": 0.7692258225181239, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 30365 + }, + { + "epoch": 0.30366, + "grad_norm": 0.9150954907539325, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 30366 + }, + { + "epoch": 0.30367, + "grad_norm": 1.0885310718077903, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 30367 + }, + { + "epoch": 0.30368, + "grad_norm": 1.154268162036633, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 30368 + }, + { + "epoch": 0.30369, + "grad_norm": 0.7038154395605817, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 30369 + }, + { + "epoch": 0.3037, + "grad_norm": 0.6402108328496792, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 30370 + }, + { + "epoch": 0.30371, + "grad_norm": 0.852178766271217, + "learning_rate": 0.003, + "loss": 3.9927, + "step": 30371 + }, + { + "epoch": 0.30372, + "grad_norm": 1.0335590463761, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 30372 + }, + { + "epoch": 0.30373, + "grad_norm": 1.0331789257182817, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 30373 + }, + { + "epoch": 0.30374, + "grad_norm": 0.9043022035134948, + "learning_rate": 0.003, + "loss": 3.9993, + "step": 30374 + }, + { + "epoch": 0.30375, + "grad_norm": 0.8487205741784615, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 30375 + }, + { + "epoch": 0.30376, + "grad_norm": 0.808127922579622, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 30376 + }, + { + "epoch": 0.30377, + "grad_norm": 0.8043081446236571, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 30377 + }, + { + "epoch": 0.30378, + "grad_norm": 0.7623884755492006, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 30378 + }, + { + "epoch": 0.30379, + "grad_norm": 0.7547818540906263, + "learning_rate": 0.003, + "loss": 4.041, + "step": 30379 + }, + { + "epoch": 0.3038, + "grad_norm": 0.7133047006229598, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 30380 + }, + { + "epoch": 0.30381, + "grad_norm": 0.7456587987280763, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 30381 + }, + { + "epoch": 0.30382, + "grad_norm": 0.8225692378688939, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 30382 + }, + { + "epoch": 0.30383, + "grad_norm": 0.9214230816784313, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 30383 + }, + { + "epoch": 0.30384, + "grad_norm": 1.2489936152749403, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 30384 + }, + { + "epoch": 0.30385, + "grad_norm": 1.0303308440839072, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 30385 + }, + { + "epoch": 0.30386, + "grad_norm": 0.8221298431272507, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 30386 + }, + { + "epoch": 0.30387, + "grad_norm": 0.8047891127843105, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 30387 + }, + { + "epoch": 0.30388, + "grad_norm": 1.0532873735691581, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 30388 + }, + { + "epoch": 0.30389, + "grad_norm": 1.0579235718529443, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 30389 + }, + { + "epoch": 0.3039, + "grad_norm": 0.8540267265579224, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 30390 + }, + { + "epoch": 0.30391, + "grad_norm": 0.709782253769263, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 30391 + }, + { + "epoch": 0.30392, + "grad_norm": 0.7060590662653896, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 30392 + }, + { + "epoch": 0.30393, + "grad_norm": 0.6470973568133837, + "learning_rate": 0.003, + "loss": 4.047, + "step": 30393 + }, + { + "epoch": 0.30394, + "grad_norm": 0.6997078132953025, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 30394 + }, + { + "epoch": 0.30395, + "grad_norm": 0.7534929531287325, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 30395 + }, + { + "epoch": 0.30396, + "grad_norm": 0.8247698008104903, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 30396 + }, + { + "epoch": 0.30397, + "grad_norm": 0.934412112346228, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 30397 + }, + { + "epoch": 0.30398, + "grad_norm": 0.9171367338808356, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 30398 + }, + { + "epoch": 0.30399, + "grad_norm": 0.9178614635710145, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 30399 + }, + { + "epoch": 0.304, + "grad_norm": 0.8361821417284744, + "learning_rate": 0.003, + "loss": 4.031, + "step": 30400 + }, + { + "epoch": 0.30401, + "grad_norm": 0.7622378866193045, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 30401 + }, + { + "epoch": 0.30402, + "grad_norm": 0.8011949664231661, + "learning_rate": 0.003, + "loss": 4.047, + "step": 30402 + }, + { + "epoch": 0.30403, + "grad_norm": 0.6668999293878359, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 30403 + }, + { + "epoch": 0.30404, + "grad_norm": 0.5943269098529899, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 30404 + }, + { + "epoch": 0.30405, + "grad_norm": 0.6021651981630596, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 30405 + }, + { + "epoch": 0.30406, + "grad_norm": 0.5703754902250703, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 30406 + }, + { + "epoch": 0.30407, + "grad_norm": 0.6541869662720878, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 30407 + }, + { + "epoch": 0.30408, + "grad_norm": 0.9105963388081413, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 30408 + }, + { + "epoch": 0.30409, + "grad_norm": 1.343175281846438, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 30409 + }, + { + "epoch": 0.3041, + "grad_norm": 0.827795638475697, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 30410 + }, + { + "epoch": 0.30411, + "grad_norm": 0.6956455471378123, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 30411 + }, + { + "epoch": 0.30412, + "grad_norm": 0.8203770484420269, + "learning_rate": 0.003, + "loss": 4.042, + "step": 30412 + }, + { + "epoch": 0.30413, + "grad_norm": 0.7906248366577798, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 30413 + }, + { + "epoch": 0.30414, + "grad_norm": 0.8718052118562648, + "learning_rate": 0.003, + "loss": 4.043, + "step": 30414 + }, + { + "epoch": 0.30415, + "grad_norm": 0.8933365564966166, + "learning_rate": 0.003, + "loss": 4.045, + "step": 30415 + }, + { + "epoch": 0.30416, + "grad_norm": 0.8904494742186979, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 30416 + }, + { + "epoch": 0.30417, + "grad_norm": 1.06988936338908, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 30417 + }, + { + "epoch": 0.30418, + "grad_norm": 1.0233534495781111, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 30418 + }, + { + "epoch": 0.30419, + "grad_norm": 1.0859681486277621, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 30419 + }, + { + "epoch": 0.3042, + "grad_norm": 1.0611062573736434, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 30420 + }, + { + "epoch": 0.30421, + "grad_norm": 0.9130582343278664, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 30421 + }, + { + "epoch": 0.30422, + "grad_norm": 0.949190403199203, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 30422 + }, + { + "epoch": 0.30423, + "grad_norm": 1.0452269589112806, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 30423 + }, + { + "epoch": 0.30424, + "grad_norm": 1.1601117408552906, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 30424 + }, + { + "epoch": 0.30425, + "grad_norm": 1.2062509466885893, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 30425 + }, + { + "epoch": 0.30426, + "grad_norm": 0.7633774157556034, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 30426 + }, + { + "epoch": 0.30427, + "grad_norm": 0.6648448145524332, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 30427 + }, + { + "epoch": 0.30428, + "grad_norm": 0.6529134540603916, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 30428 + }, + { + "epoch": 0.30429, + "grad_norm": 0.7048279810122813, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 30429 + }, + { + "epoch": 0.3043, + "grad_norm": 0.7955976938689573, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 30430 + }, + { + "epoch": 0.30431, + "grad_norm": 0.9720577874014509, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 30431 + }, + { + "epoch": 0.30432, + "grad_norm": 1.1268461390707243, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 30432 + }, + { + "epoch": 0.30433, + "grad_norm": 0.7636657829703977, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 30433 + }, + { + "epoch": 0.30434, + "grad_norm": 0.7030947605766785, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 30434 + }, + { + "epoch": 0.30435, + "grad_norm": 0.5670352688566129, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 30435 + }, + { + "epoch": 0.30436, + "grad_norm": 0.6837463365073297, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 30436 + }, + { + "epoch": 0.30437, + "grad_norm": 0.7911284538229383, + "learning_rate": 0.003, + "loss": 4.027, + "step": 30437 + }, + { + "epoch": 0.30438, + "grad_norm": 0.8024377689290391, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 30438 + }, + { + "epoch": 0.30439, + "grad_norm": 0.7336471641850336, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 30439 + }, + { + "epoch": 0.3044, + "grad_norm": 0.8031943414482025, + "learning_rate": 0.003, + "loss": 3.9853, + "step": 30440 + }, + { + "epoch": 0.30441, + "grad_norm": 0.9310863609266561, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 30441 + }, + { + "epoch": 0.30442, + "grad_norm": 1.084328173625542, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 30442 + }, + { + "epoch": 0.30443, + "grad_norm": 0.81091650004587, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 30443 + }, + { + "epoch": 0.30444, + "grad_norm": 0.7283605198772602, + "learning_rate": 0.003, + "loss": 4.049, + "step": 30444 + }, + { + "epoch": 0.30445, + "grad_norm": 0.726949573576708, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 30445 + }, + { + "epoch": 0.30446, + "grad_norm": 0.7050301561999607, + "learning_rate": 0.003, + "loss": 4.0043, + "step": 30446 + }, + { + "epoch": 0.30447, + "grad_norm": 0.6356960710954501, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 30447 + }, + { + "epoch": 0.30448, + "grad_norm": 0.7117935539646044, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 30448 + }, + { + "epoch": 0.30449, + "grad_norm": 0.6701131598886387, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 30449 + }, + { + "epoch": 0.3045, + "grad_norm": 0.6896217900115332, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 30450 + }, + { + "epoch": 0.30451, + "grad_norm": 0.851917201949667, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 30451 + }, + { + "epoch": 0.30452, + "grad_norm": 1.0226558531052654, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 30452 + }, + { + "epoch": 0.30453, + "grad_norm": 1.030817193177102, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 30453 + }, + { + "epoch": 0.30454, + "grad_norm": 1.0778862701257226, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 30454 + }, + { + "epoch": 0.30455, + "grad_norm": 0.9012286792449349, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 30455 + }, + { + "epoch": 0.30456, + "grad_norm": 0.8644815132532928, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 30456 + }, + { + "epoch": 0.30457, + "grad_norm": 0.8638120009318355, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 30457 + }, + { + "epoch": 0.30458, + "grad_norm": 1.02810107246511, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 30458 + }, + { + "epoch": 0.30459, + "grad_norm": 1.121234329621712, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 30459 + }, + { + "epoch": 0.3046, + "grad_norm": 0.8549847885210355, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 30460 + }, + { + "epoch": 0.30461, + "grad_norm": 0.8971993868883004, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 30461 + }, + { + "epoch": 0.30462, + "grad_norm": 0.8394039121803021, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 30462 + }, + { + "epoch": 0.30463, + "grad_norm": 0.8460804544011828, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 30463 + }, + { + "epoch": 0.30464, + "grad_norm": 0.848653454075666, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 30464 + }, + { + "epoch": 0.30465, + "grad_norm": 0.9782271380574802, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 30465 + }, + { + "epoch": 0.30466, + "grad_norm": 1.0659785694179789, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 30466 + }, + { + "epoch": 0.30467, + "grad_norm": 1.0248519914732717, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 30467 + }, + { + "epoch": 0.30468, + "grad_norm": 1.1156964821586373, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 30468 + }, + { + "epoch": 0.30469, + "grad_norm": 0.8743157369442729, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 30469 + }, + { + "epoch": 0.3047, + "grad_norm": 0.7959386058210299, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 30470 + }, + { + "epoch": 0.30471, + "grad_norm": 0.862088085197215, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 30471 + }, + { + "epoch": 0.30472, + "grad_norm": 0.8846574972125563, + "learning_rate": 0.003, + "loss": 4.065, + "step": 30472 + }, + { + "epoch": 0.30473, + "grad_norm": 0.8483636677170218, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 30473 + }, + { + "epoch": 0.30474, + "grad_norm": 0.8817447383655752, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 30474 + }, + { + "epoch": 0.30475, + "grad_norm": 0.9966391894746914, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 30475 + }, + { + "epoch": 0.30476, + "grad_norm": 1.1962880796765865, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 30476 + }, + { + "epoch": 0.30477, + "grad_norm": 0.9625264371357545, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 30477 + }, + { + "epoch": 0.30478, + "grad_norm": 0.9750212509489287, + "learning_rate": 0.003, + "loss": 4.077, + "step": 30478 + }, + { + "epoch": 0.30479, + "grad_norm": 0.9481598119386552, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 30479 + }, + { + "epoch": 0.3048, + "grad_norm": 0.7988261716212065, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 30480 + }, + { + "epoch": 0.30481, + "grad_norm": 0.847550955821006, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 30481 + }, + { + "epoch": 0.30482, + "grad_norm": 0.8917790063521829, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 30482 + }, + { + "epoch": 0.30483, + "grad_norm": 0.7224393477258957, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 30483 + }, + { + "epoch": 0.30484, + "grad_norm": 0.700643387335102, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 30484 + }, + { + "epoch": 0.30485, + "grad_norm": 0.7039354099915305, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 30485 + }, + { + "epoch": 0.30486, + "grad_norm": 0.815230929702849, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 30486 + }, + { + "epoch": 0.30487, + "grad_norm": 1.092441901058937, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 30487 + }, + { + "epoch": 0.30488, + "grad_norm": 1.0436284620564016, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 30488 + }, + { + "epoch": 0.30489, + "grad_norm": 1.0122071127388763, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 30489 + }, + { + "epoch": 0.3049, + "grad_norm": 1.071092342491752, + "learning_rate": 0.003, + "loss": 4.074, + "step": 30490 + }, + { + "epoch": 0.30491, + "grad_norm": 0.8436317067588053, + "learning_rate": 0.003, + "loss": 4.0076, + "step": 30491 + }, + { + "epoch": 0.30492, + "grad_norm": 0.7614694409654934, + "learning_rate": 0.003, + "loss": 4.045, + "step": 30492 + }, + { + "epoch": 0.30493, + "grad_norm": 0.6153804046136563, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 30493 + }, + { + "epoch": 0.30494, + "grad_norm": 0.6106193435479481, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 30494 + }, + { + "epoch": 0.30495, + "grad_norm": 0.5330262898240072, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 30495 + }, + { + "epoch": 0.30496, + "grad_norm": 0.5738942532228786, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 30496 + }, + { + "epoch": 0.30497, + "grad_norm": 0.575719497939543, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 30497 + }, + { + "epoch": 0.30498, + "grad_norm": 0.6417501536600894, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 30498 + }, + { + "epoch": 0.30499, + "grad_norm": 0.65048652159219, + "learning_rate": 0.003, + "loss": 3.9942, + "step": 30499 + }, + { + "epoch": 0.305, + "grad_norm": 0.6868841833889263, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 30500 + }, + { + "epoch": 0.30501, + "grad_norm": 0.8744731226918196, + "learning_rate": 0.003, + "loss": 4.013, + "step": 30501 + }, + { + "epoch": 0.30502, + "grad_norm": 1.1334367837373611, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 30502 + }, + { + "epoch": 0.30503, + "grad_norm": 0.8577381633053462, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 30503 + }, + { + "epoch": 0.30504, + "grad_norm": 0.6758927448067685, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 30504 + }, + { + "epoch": 0.30505, + "grad_norm": 0.8543288397328942, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 30505 + }, + { + "epoch": 0.30506, + "grad_norm": 0.9280161416946323, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 30506 + }, + { + "epoch": 0.30507, + "grad_norm": 0.8625977229270854, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 30507 + }, + { + "epoch": 0.30508, + "grad_norm": 0.7481427245766996, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 30508 + }, + { + "epoch": 0.30509, + "grad_norm": 0.7460777333936117, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 30509 + }, + { + "epoch": 0.3051, + "grad_norm": 0.8450310358772559, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 30510 + }, + { + "epoch": 0.30511, + "grad_norm": 0.8982895605793537, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 30511 + }, + { + "epoch": 0.30512, + "grad_norm": 1.0241248435221675, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 30512 + }, + { + "epoch": 0.30513, + "grad_norm": 0.9244849904760327, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 30513 + }, + { + "epoch": 0.30514, + "grad_norm": 0.8647594021727987, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 30514 + }, + { + "epoch": 0.30515, + "grad_norm": 0.8321297948978755, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 30515 + }, + { + "epoch": 0.30516, + "grad_norm": 0.7647133873584363, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 30516 + }, + { + "epoch": 0.30517, + "grad_norm": 0.6969550977042609, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 30517 + }, + { + "epoch": 0.30518, + "grad_norm": 0.7121579713015437, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 30518 + }, + { + "epoch": 0.30519, + "grad_norm": 0.7030654037798773, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 30519 + }, + { + "epoch": 0.3052, + "grad_norm": 0.7175191410393085, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 30520 + }, + { + "epoch": 0.30521, + "grad_norm": 0.9671655814895952, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 30521 + }, + { + "epoch": 0.30522, + "grad_norm": 1.094423253419572, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 30522 + }, + { + "epoch": 0.30523, + "grad_norm": 0.9172453916709014, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 30523 + }, + { + "epoch": 0.30524, + "grad_norm": 0.9449523071791175, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 30524 + }, + { + "epoch": 0.30525, + "grad_norm": 1.1251277533478277, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 30525 + }, + { + "epoch": 0.30526, + "grad_norm": 1.11090565433013, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 30526 + }, + { + "epoch": 0.30527, + "grad_norm": 1.2751994365432053, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 30527 + }, + { + "epoch": 0.30528, + "grad_norm": 1.0351062424991189, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 30528 + }, + { + "epoch": 0.30529, + "grad_norm": 0.9795659440229405, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 30529 + }, + { + "epoch": 0.3053, + "grad_norm": 1.0004309652448256, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 30530 + }, + { + "epoch": 0.30531, + "grad_norm": 0.9577289598588042, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 30531 + }, + { + "epoch": 0.30532, + "grad_norm": 1.0858908739094182, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 30532 + }, + { + "epoch": 0.30533, + "grad_norm": 1.0028607330201866, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 30533 + }, + { + "epoch": 0.30534, + "grad_norm": 1.010395079641831, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 30534 + }, + { + "epoch": 0.30535, + "grad_norm": 1.0323193791886558, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 30535 + }, + { + "epoch": 0.30536, + "grad_norm": 1.11833446147928, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 30536 + }, + { + "epoch": 0.30537, + "grad_norm": 1.0003198632891857, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 30537 + }, + { + "epoch": 0.30538, + "grad_norm": 1.0114548478464949, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 30538 + }, + { + "epoch": 0.30539, + "grad_norm": 0.935820896496027, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 30539 + }, + { + "epoch": 0.3054, + "grad_norm": 0.8795592934592581, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 30540 + }, + { + "epoch": 0.30541, + "grad_norm": 0.7636974905220982, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 30541 + }, + { + "epoch": 0.30542, + "grad_norm": 0.7754541921729162, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 30542 + }, + { + "epoch": 0.30543, + "grad_norm": 0.689826645480617, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 30543 + }, + { + "epoch": 0.30544, + "grad_norm": 0.6819288447751873, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 30544 + }, + { + "epoch": 0.30545, + "grad_norm": 0.7125149965574896, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 30545 + }, + { + "epoch": 0.30546, + "grad_norm": 0.9091752497513007, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 30546 + }, + { + "epoch": 0.30547, + "grad_norm": 0.9367698536764194, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 30547 + }, + { + "epoch": 0.30548, + "grad_norm": 0.7897532811621403, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 30548 + }, + { + "epoch": 0.30549, + "grad_norm": 0.8530441336889497, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 30549 + }, + { + "epoch": 0.3055, + "grad_norm": 1.0069920211125059, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 30550 + }, + { + "epoch": 0.30551, + "grad_norm": 1.0644644544064013, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 30551 + }, + { + "epoch": 0.30552, + "grad_norm": 0.8860204378475169, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 30552 + }, + { + "epoch": 0.30553, + "grad_norm": 0.8583705941414066, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 30553 + }, + { + "epoch": 0.30554, + "grad_norm": 0.7766281771727838, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 30554 + }, + { + "epoch": 0.30555, + "grad_norm": 0.7059658486029725, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 30555 + }, + { + "epoch": 0.30556, + "grad_norm": 0.8801782763900726, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 30556 + }, + { + "epoch": 0.30557, + "grad_norm": 1.170160288310564, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 30557 + }, + { + "epoch": 0.30558, + "grad_norm": 0.9929777763212249, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 30558 + }, + { + "epoch": 0.30559, + "grad_norm": 0.9514168585274976, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 30559 + }, + { + "epoch": 0.3056, + "grad_norm": 1.0573484859536102, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 30560 + }, + { + "epoch": 0.30561, + "grad_norm": 1.1638415987963802, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 30561 + }, + { + "epoch": 0.30562, + "grad_norm": 0.7856256036015478, + "learning_rate": 0.003, + "loss": 4.077, + "step": 30562 + }, + { + "epoch": 0.30563, + "grad_norm": 0.7992188013287786, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 30563 + }, + { + "epoch": 0.30564, + "grad_norm": 0.9011935903417795, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 30564 + }, + { + "epoch": 0.30565, + "grad_norm": 0.9613462140651601, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 30565 + }, + { + "epoch": 0.30566, + "grad_norm": 1.1770448830485234, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 30566 + }, + { + "epoch": 0.30567, + "grad_norm": 0.9866294548294452, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 30567 + }, + { + "epoch": 0.30568, + "grad_norm": 0.8587004059062714, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 30568 + }, + { + "epoch": 0.30569, + "grad_norm": 0.7460376820100256, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 30569 + }, + { + "epoch": 0.3057, + "grad_norm": 0.8152461645244726, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 30570 + }, + { + "epoch": 0.30571, + "grad_norm": 0.6711375809235586, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 30571 + }, + { + "epoch": 0.30572, + "grad_norm": 0.5792487971960065, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 30572 + }, + { + "epoch": 0.30573, + "grad_norm": 0.567052993699175, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 30573 + }, + { + "epoch": 0.30574, + "grad_norm": 0.527951144700206, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 30574 + }, + { + "epoch": 0.30575, + "grad_norm": 0.5718809410268307, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 30575 + }, + { + "epoch": 0.30576, + "grad_norm": 0.7378430186273189, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 30576 + }, + { + "epoch": 0.30577, + "grad_norm": 0.912170837473285, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 30577 + }, + { + "epoch": 0.30578, + "grad_norm": 1.053663314316171, + "learning_rate": 0.003, + "loss": 4.041, + "step": 30578 + }, + { + "epoch": 0.30579, + "grad_norm": 1.0188598772298014, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 30579 + }, + { + "epoch": 0.3058, + "grad_norm": 0.7933571510957059, + "learning_rate": 0.003, + "loss": 4.045, + "step": 30580 + }, + { + "epoch": 0.30581, + "grad_norm": 0.7423629803025442, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 30581 + }, + { + "epoch": 0.30582, + "grad_norm": 0.9288754452642052, + "learning_rate": 0.003, + "loss": 4.048, + "step": 30582 + }, + { + "epoch": 0.30583, + "grad_norm": 0.9870822357771121, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 30583 + }, + { + "epoch": 0.30584, + "grad_norm": 0.9460554914760135, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 30584 + }, + { + "epoch": 0.30585, + "grad_norm": 0.8507144445210616, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 30585 + }, + { + "epoch": 0.30586, + "grad_norm": 0.808819355522907, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 30586 + }, + { + "epoch": 0.30587, + "grad_norm": 0.7880603756458274, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 30587 + }, + { + "epoch": 0.30588, + "grad_norm": 0.7425789147273255, + "learning_rate": 0.003, + "loss": 4.05, + "step": 30588 + }, + { + "epoch": 0.30589, + "grad_norm": 0.7452021928875582, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 30589 + }, + { + "epoch": 0.3059, + "grad_norm": 0.736722001570775, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 30590 + }, + { + "epoch": 0.30591, + "grad_norm": 0.6768935947292244, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 30591 + }, + { + "epoch": 0.30592, + "grad_norm": 0.7349154221715212, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 30592 + }, + { + "epoch": 0.30593, + "grad_norm": 0.812926759726785, + "learning_rate": 0.003, + "loss": 4.027, + "step": 30593 + }, + { + "epoch": 0.30594, + "grad_norm": 0.9142196346174669, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 30594 + }, + { + "epoch": 0.30595, + "grad_norm": 0.9459242455123975, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 30595 + }, + { + "epoch": 0.30596, + "grad_norm": 0.9603696829288325, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 30596 + }, + { + "epoch": 0.30597, + "grad_norm": 0.9464740078390893, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 30597 + }, + { + "epoch": 0.30598, + "grad_norm": 0.7345739115336234, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 30598 + }, + { + "epoch": 0.30599, + "grad_norm": 0.6468237801965143, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 30599 + }, + { + "epoch": 0.306, + "grad_norm": 0.5937269127504389, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 30600 + }, + { + "epoch": 0.30601, + "grad_norm": 0.5489578885187612, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 30601 + }, + { + "epoch": 0.30602, + "grad_norm": 0.5807362357943842, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 30602 + }, + { + "epoch": 0.30603, + "grad_norm": 0.682308873246767, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 30603 + }, + { + "epoch": 0.30604, + "grad_norm": 0.8225353030731932, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 30604 + }, + { + "epoch": 0.30605, + "grad_norm": 0.9959542909348267, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 30605 + }, + { + "epoch": 0.30606, + "grad_norm": 1.0820119134820312, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 30606 + }, + { + "epoch": 0.30607, + "grad_norm": 0.7269649660820245, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 30607 + }, + { + "epoch": 0.30608, + "grad_norm": 0.5665326210790931, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 30608 + }, + { + "epoch": 0.30609, + "grad_norm": 0.6946651238918355, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 30609 + }, + { + "epoch": 0.3061, + "grad_norm": 0.7798391941355483, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 30610 + }, + { + "epoch": 0.30611, + "grad_norm": 0.7308736040999012, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 30611 + }, + { + "epoch": 0.30612, + "grad_norm": 0.7139268096499337, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 30612 + }, + { + "epoch": 0.30613, + "grad_norm": 0.9613536881897801, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 30613 + }, + { + "epoch": 0.30614, + "grad_norm": 1.2135614617239538, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 30614 + }, + { + "epoch": 0.30615, + "grad_norm": 0.8008531705751949, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 30615 + }, + { + "epoch": 0.30616, + "grad_norm": 0.6638249458354882, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 30616 + }, + { + "epoch": 0.30617, + "grad_norm": 0.6825650409506161, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 30617 + }, + { + "epoch": 0.30618, + "grad_norm": 0.6608174467119787, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 30618 + }, + { + "epoch": 0.30619, + "grad_norm": 0.7136153781779269, + "learning_rate": 0.003, + "loss": 4.047, + "step": 30619 + }, + { + "epoch": 0.3062, + "grad_norm": 0.7558530286089896, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 30620 + }, + { + "epoch": 0.30621, + "grad_norm": 1.0218703744776076, + "learning_rate": 0.003, + "loss": 4.037, + "step": 30621 + }, + { + "epoch": 0.30622, + "grad_norm": 1.282246599701646, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 30622 + }, + { + "epoch": 0.30623, + "grad_norm": 0.8571704376223703, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 30623 + }, + { + "epoch": 0.30624, + "grad_norm": 0.7082722612587846, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 30624 + }, + { + "epoch": 0.30625, + "grad_norm": 0.56463135643707, + "learning_rate": 0.003, + "loss": 4.0034, + "step": 30625 + }, + { + "epoch": 0.30626, + "grad_norm": 0.58473763401941, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 30626 + }, + { + "epoch": 0.30627, + "grad_norm": 0.6657359129845389, + "learning_rate": 0.003, + "loss": 4.037, + "step": 30627 + }, + { + "epoch": 0.30628, + "grad_norm": 0.7879959047768911, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 30628 + }, + { + "epoch": 0.30629, + "grad_norm": 0.752512042438393, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 30629 + }, + { + "epoch": 0.3063, + "grad_norm": 0.6755480612240439, + "learning_rate": 0.003, + "loss": 4.053, + "step": 30630 + }, + { + "epoch": 0.30631, + "grad_norm": 0.7047429811229652, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 30631 + }, + { + "epoch": 0.30632, + "grad_norm": 0.6944437339468187, + "learning_rate": 0.003, + "loss": 4.048, + "step": 30632 + }, + { + "epoch": 0.30633, + "grad_norm": 0.7891599804467359, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 30633 + }, + { + "epoch": 0.30634, + "grad_norm": 0.9680067253513436, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 30634 + }, + { + "epoch": 0.30635, + "grad_norm": 1.2627476927545154, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 30635 + }, + { + "epoch": 0.30636, + "grad_norm": 0.8720670851829797, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 30636 + }, + { + "epoch": 0.30637, + "grad_norm": 0.750197936028176, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 30637 + }, + { + "epoch": 0.30638, + "grad_norm": 0.7150018146792149, + "learning_rate": 0.003, + "loss": 3.9884, + "step": 30638 + }, + { + "epoch": 0.30639, + "grad_norm": 0.6863645011478116, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 30639 + }, + { + "epoch": 0.3064, + "grad_norm": 0.8151166348577135, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 30640 + }, + { + "epoch": 0.30641, + "grad_norm": 0.8980925804134118, + "learning_rate": 0.003, + "loss": 4.046, + "step": 30641 + }, + { + "epoch": 0.30642, + "grad_norm": 1.09202452975863, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 30642 + }, + { + "epoch": 0.30643, + "grad_norm": 1.1743605930409449, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 30643 + }, + { + "epoch": 0.30644, + "grad_norm": 0.8558510527686386, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 30644 + }, + { + "epoch": 0.30645, + "grad_norm": 0.7707390344297989, + "learning_rate": 0.003, + "loss": 4.027, + "step": 30645 + }, + { + "epoch": 0.30646, + "grad_norm": 0.7947706431714266, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 30646 + }, + { + "epoch": 0.30647, + "grad_norm": 0.8104892367891933, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 30647 + }, + { + "epoch": 0.30648, + "grad_norm": 0.9092938592949533, + "learning_rate": 0.003, + "loss": 4.016, + "step": 30648 + }, + { + "epoch": 0.30649, + "grad_norm": 0.9414620171263496, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 30649 + }, + { + "epoch": 0.3065, + "grad_norm": 0.8766207163948251, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 30650 + }, + { + "epoch": 0.30651, + "grad_norm": 0.6792322753026214, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 30651 + }, + { + "epoch": 0.30652, + "grad_norm": 0.6911356129148062, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 30652 + }, + { + "epoch": 0.30653, + "grad_norm": 0.7638155195380214, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 30653 + }, + { + "epoch": 0.30654, + "grad_norm": 0.8542605001923141, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 30654 + }, + { + "epoch": 0.30655, + "grad_norm": 0.9602365126244305, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 30655 + }, + { + "epoch": 0.30656, + "grad_norm": 1.2354362880311769, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 30656 + }, + { + "epoch": 0.30657, + "grad_norm": 0.7183424225104932, + "learning_rate": 0.003, + "loss": 4.012, + "step": 30657 + }, + { + "epoch": 0.30658, + "grad_norm": 0.6680684768754789, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 30658 + }, + { + "epoch": 0.30659, + "grad_norm": 0.8483015811732115, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 30659 + }, + { + "epoch": 0.3066, + "grad_norm": 0.8684039498961769, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 30660 + }, + { + "epoch": 0.30661, + "grad_norm": 0.923783096800622, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 30661 + }, + { + "epoch": 0.30662, + "grad_norm": 1.0160246191968734, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 30662 + }, + { + "epoch": 0.30663, + "grad_norm": 1.109590049741251, + "learning_rate": 0.003, + "loss": 4.025, + "step": 30663 + }, + { + "epoch": 0.30664, + "grad_norm": 0.8715465770190681, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 30664 + }, + { + "epoch": 0.30665, + "grad_norm": 0.7476768010657511, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 30665 + }, + { + "epoch": 0.30666, + "grad_norm": 0.7515838215983088, + "learning_rate": 0.003, + "loss": 4.032, + "step": 30666 + }, + { + "epoch": 0.30667, + "grad_norm": 0.7117954565992811, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 30667 + }, + { + "epoch": 0.30668, + "grad_norm": 0.6725148966287215, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 30668 + }, + { + "epoch": 0.30669, + "grad_norm": 0.6428016803998156, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 30669 + }, + { + "epoch": 0.3067, + "grad_norm": 0.6881369132104858, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 30670 + }, + { + "epoch": 0.30671, + "grad_norm": 0.8124269866500381, + "learning_rate": 0.003, + "loss": 4.028, + "step": 30671 + }, + { + "epoch": 0.30672, + "grad_norm": 0.8635767436865468, + "learning_rate": 0.003, + "loss": 4.002, + "step": 30672 + }, + { + "epoch": 0.30673, + "grad_norm": 1.0622452012900765, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 30673 + }, + { + "epoch": 0.30674, + "grad_norm": 1.223491916872636, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 30674 + }, + { + "epoch": 0.30675, + "grad_norm": 0.9484656323084134, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 30675 + }, + { + "epoch": 0.30676, + "grad_norm": 0.9658077797568466, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 30676 + }, + { + "epoch": 0.30677, + "grad_norm": 1.0729349158756862, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 30677 + }, + { + "epoch": 0.30678, + "grad_norm": 0.9254564039432714, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 30678 + }, + { + "epoch": 0.30679, + "grad_norm": 0.8677982876488929, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 30679 + }, + { + "epoch": 0.3068, + "grad_norm": 0.872402191526423, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 30680 + }, + { + "epoch": 0.30681, + "grad_norm": 0.9351671989504667, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 30681 + }, + { + "epoch": 0.30682, + "grad_norm": 0.8673203961207866, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 30682 + }, + { + "epoch": 0.30683, + "grad_norm": 0.7703040328833971, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 30683 + }, + { + "epoch": 0.30684, + "grad_norm": 0.8357480239427207, + "learning_rate": 0.003, + "loss": 4.054, + "step": 30684 + }, + { + "epoch": 0.30685, + "grad_norm": 0.8884315351312037, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 30685 + }, + { + "epoch": 0.30686, + "grad_norm": 0.924637440083138, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 30686 + }, + { + "epoch": 0.30687, + "grad_norm": 1.019542626364121, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 30687 + }, + { + "epoch": 0.30688, + "grad_norm": 0.9272995870744947, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 30688 + }, + { + "epoch": 0.30689, + "grad_norm": 0.797097838928584, + "learning_rate": 0.003, + "loss": 4.028, + "step": 30689 + }, + { + "epoch": 0.3069, + "grad_norm": 0.6586977756784121, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 30690 + }, + { + "epoch": 0.30691, + "grad_norm": 0.6648619160547126, + "learning_rate": 0.003, + "loss": 4.007, + "step": 30691 + }, + { + "epoch": 0.30692, + "grad_norm": 0.6907664307163258, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 30692 + }, + { + "epoch": 0.30693, + "grad_norm": 0.7235317717380572, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 30693 + }, + { + "epoch": 0.30694, + "grad_norm": 0.7525606313776578, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 30694 + }, + { + "epoch": 0.30695, + "grad_norm": 0.9072978984072535, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 30695 + }, + { + "epoch": 0.30696, + "grad_norm": 1.0555384750352588, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 30696 + }, + { + "epoch": 0.30697, + "grad_norm": 1.0615189935440474, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 30697 + }, + { + "epoch": 0.30698, + "grad_norm": 0.9906547515860503, + "learning_rate": 0.003, + "loss": 4.042, + "step": 30698 + }, + { + "epoch": 0.30699, + "grad_norm": 1.0762846268094246, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 30699 + }, + { + "epoch": 0.307, + "grad_norm": 0.926707911512607, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 30700 + }, + { + "epoch": 0.30701, + "grad_norm": 0.9633372217034292, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 30701 + }, + { + "epoch": 0.30702, + "grad_norm": 0.8086401603134522, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 30702 + }, + { + "epoch": 0.30703, + "grad_norm": 0.7232553759407089, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 30703 + }, + { + "epoch": 0.30704, + "grad_norm": 0.7200771870020278, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 30704 + }, + { + "epoch": 0.30705, + "grad_norm": 0.7334945392059585, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 30705 + }, + { + "epoch": 0.30706, + "grad_norm": 0.8101202961658248, + "learning_rate": 0.003, + "loss": 4.046, + "step": 30706 + }, + { + "epoch": 0.30707, + "grad_norm": 0.68569279375923, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 30707 + }, + { + "epoch": 0.30708, + "grad_norm": 0.6723718648987741, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 30708 + }, + { + "epoch": 0.30709, + "grad_norm": 0.7331277554464692, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 30709 + }, + { + "epoch": 0.3071, + "grad_norm": 0.9774950553158962, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 30710 + }, + { + "epoch": 0.30711, + "grad_norm": 1.1288102519230838, + "learning_rate": 0.003, + "loss": 4.037, + "step": 30711 + }, + { + "epoch": 0.30712, + "grad_norm": 0.9404214799378747, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 30712 + }, + { + "epoch": 0.30713, + "grad_norm": 0.7677574751436614, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 30713 + }, + { + "epoch": 0.30714, + "grad_norm": 0.6811551460108075, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 30714 + }, + { + "epoch": 0.30715, + "grad_norm": 0.7481709530439791, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 30715 + }, + { + "epoch": 0.30716, + "grad_norm": 0.7657774624992527, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 30716 + }, + { + "epoch": 0.30717, + "grad_norm": 0.79596476756303, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 30717 + }, + { + "epoch": 0.30718, + "grad_norm": 0.8558118615860202, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 30718 + }, + { + "epoch": 0.30719, + "grad_norm": 0.9065773726730117, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 30719 + }, + { + "epoch": 0.3072, + "grad_norm": 0.9957608095537034, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 30720 + }, + { + "epoch": 0.30721, + "grad_norm": 1.0625921967061749, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 30721 + }, + { + "epoch": 0.30722, + "grad_norm": 0.9823392548062775, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 30722 + }, + { + "epoch": 0.30723, + "grad_norm": 1.0091725179097797, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 30723 + }, + { + "epoch": 0.30724, + "grad_norm": 0.9543424714192384, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 30724 + }, + { + "epoch": 0.30725, + "grad_norm": 0.8544466302349751, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 30725 + }, + { + "epoch": 0.30726, + "grad_norm": 0.722953222684606, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 30726 + }, + { + "epoch": 0.30727, + "grad_norm": 0.6733410345085835, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 30727 + }, + { + "epoch": 0.30728, + "grad_norm": 0.6932912273937791, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 30728 + }, + { + "epoch": 0.30729, + "grad_norm": 0.6469664067554134, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 30729 + }, + { + "epoch": 0.3073, + "grad_norm": 0.6276250865015277, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 30730 + }, + { + "epoch": 0.30731, + "grad_norm": 0.7085524018596607, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 30731 + }, + { + "epoch": 0.30732, + "grad_norm": 0.8788837999463663, + "learning_rate": 0.003, + "loss": 3.986, + "step": 30732 + }, + { + "epoch": 0.30733, + "grad_norm": 1.1045865189338693, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 30733 + }, + { + "epoch": 0.30734, + "grad_norm": 1.2239367269734398, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 30734 + }, + { + "epoch": 0.30735, + "grad_norm": 0.9000751469387722, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 30735 + }, + { + "epoch": 0.30736, + "grad_norm": 0.7531563503305398, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 30736 + }, + { + "epoch": 0.30737, + "grad_norm": 0.7578205678049688, + "learning_rate": 0.003, + "loss": 4.036, + "step": 30737 + }, + { + "epoch": 0.30738, + "grad_norm": 0.750360089831253, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 30738 + }, + { + "epoch": 0.30739, + "grad_norm": 0.7990357997325144, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 30739 + }, + { + "epoch": 0.3074, + "grad_norm": 0.6683234160878258, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 30740 + }, + { + "epoch": 0.30741, + "grad_norm": 0.6196475046195603, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 30741 + }, + { + "epoch": 0.30742, + "grad_norm": 0.6338111350587821, + "learning_rate": 0.003, + "loss": 4.039, + "step": 30742 + }, + { + "epoch": 0.30743, + "grad_norm": 0.5894121607969608, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 30743 + }, + { + "epoch": 0.30744, + "grad_norm": 0.5696970641148376, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 30744 + }, + { + "epoch": 0.30745, + "grad_norm": 0.6689444183158609, + "learning_rate": 0.003, + "loss": 4.033, + "step": 30745 + }, + { + "epoch": 0.30746, + "grad_norm": 0.889369189773622, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 30746 + }, + { + "epoch": 0.30747, + "grad_norm": 1.2530287115991772, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 30747 + }, + { + "epoch": 0.30748, + "grad_norm": 0.9262956253487464, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 30748 + }, + { + "epoch": 0.30749, + "grad_norm": 0.9417936702951035, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 30749 + }, + { + "epoch": 0.3075, + "grad_norm": 0.9895502121683721, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 30750 + }, + { + "epoch": 0.30751, + "grad_norm": 1.1743798058987138, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 30751 + }, + { + "epoch": 0.30752, + "grad_norm": 0.8308033324837698, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 30752 + }, + { + "epoch": 0.30753, + "grad_norm": 0.736248693918308, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 30753 + }, + { + "epoch": 0.30754, + "grad_norm": 0.7478044554207499, + "learning_rate": 0.003, + "loss": 4.017, + "step": 30754 + }, + { + "epoch": 0.30755, + "grad_norm": 0.8205637750415998, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 30755 + }, + { + "epoch": 0.30756, + "grad_norm": 0.8808907269838356, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 30756 + }, + { + "epoch": 0.30757, + "grad_norm": 0.9630424453638969, + "learning_rate": 0.003, + "loss": 4.043, + "step": 30757 + }, + { + "epoch": 0.30758, + "grad_norm": 1.0198432688178847, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 30758 + }, + { + "epoch": 0.30759, + "grad_norm": 1.0615540073933272, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 30759 + }, + { + "epoch": 0.3076, + "grad_norm": 0.8878911904851673, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 30760 + }, + { + "epoch": 0.30761, + "grad_norm": 0.8398407204491872, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 30761 + }, + { + "epoch": 0.30762, + "grad_norm": 0.8354126947854734, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 30762 + }, + { + "epoch": 0.30763, + "grad_norm": 0.7706710221221755, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 30763 + }, + { + "epoch": 0.30764, + "grad_norm": 0.7139683242905777, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 30764 + }, + { + "epoch": 0.30765, + "grad_norm": 0.6266968520992895, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 30765 + }, + { + "epoch": 0.30766, + "grad_norm": 0.6850966865394219, + "learning_rate": 0.003, + "loss": 4.03, + "step": 30766 + }, + { + "epoch": 0.30767, + "grad_norm": 0.7739496069807652, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 30767 + }, + { + "epoch": 0.30768, + "grad_norm": 0.9300343281916225, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 30768 + }, + { + "epoch": 0.30769, + "grad_norm": 1.0123155637226164, + "learning_rate": 0.003, + "loss": 4.056, + "step": 30769 + }, + { + "epoch": 0.3077, + "grad_norm": 0.9438090466462085, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 30770 + }, + { + "epoch": 0.30771, + "grad_norm": 0.8929899468730477, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 30771 + }, + { + "epoch": 0.30772, + "grad_norm": 0.9930606707602103, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 30772 + }, + { + "epoch": 0.30773, + "grad_norm": 1.078348769144722, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 30773 + }, + { + "epoch": 0.30774, + "grad_norm": 0.9683700832091975, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 30774 + }, + { + "epoch": 0.30775, + "grad_norm": 0.9851883506879314, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 30775 + }, + { + "epoch": 0.30776, + "grad_norm": 0.9117119240334299, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 30776 + }, + { + "epoch": 0.30777, + "grad_norm": 0.8501132354062436, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 30777 + }, + { + "epoch": 0.30778, + "grad_norm": 0.8039859642464788, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 30778 + }, + { + "epoch": 0.30779, + "grad_norm": 0.7974274445564175, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 30779 + }, + { + "epoch": 0.3078, + "grad_norm": 0.7328806124359419, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 30780 + }, + { + "epoch": 0.30781, + "grad_norm": 0.6843948783087196, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 30781 + }, + { + "epoch": 0.30782, + "grad_norm": 0.7941531535343496, + "learning_rate": 0.003, + "loss": 4.073, + "step": 30782 + }, + { + "epoch": 0.30783, + "grad_norm": 0.821662142286765, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 30783 + }, + { + "epoch": 0.30784, + "grad_norm": 0.8409291472726417, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 30784 + }, + { + "epoch": 0.30785, + "grad_norm": 0.8588215281473421, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 30785 + }, + { + "epoch": 0.30786, + "grad_norm": 0.9470323416410826, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 30786 + }, + { + "epoch": 0.30787, + "grad_norm": 0.9683240303288001, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 30787 + }, + { + "epoch": 0.30788, + "grad_norm": 0.8352878720201966, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 30788 + }, + { + "epoch": 0.30789, + "grad_norm": 0.8143190605352566, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 30789 + }, + { + "epoch": 0.3079, + "grad_norm": 0.8790867337610303, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 30790 + }, + { + "epoch": 0.30791, + "grad_norm": 0.9975166326202086, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 30791 + }, + { + "epoch": 0.30792, + "grad_norm": 1.2475447282973846, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 30792 + }, + { + "epoch": 0.30793, + "grad_norm": 0.7264304267212495, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 30793 + }, + { + "epoch": 0.30794, + "grad_norm": 0.7947401426509082, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 30794 + }, + { + "epoch": 0.30795, + "grad_norm": 0.8208642665880733, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 30795 + }, + { + "epoch": 0.30796, + "grad_norm": 0.7854857138475605, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 30796 + }, + { + "epoch": 0.30797, + "grad_norm": 0.7976923160247547, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 30797 + }, + { + "epoch": 0.30798, + "grad_norm": 0.9465109887178134, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 30798 + }, + { + "epoch": 0.30799, + "grad_norm": 1.0277431708472036, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 30799 + }, + { + "epoch": 0.308, + "grad_norm": 1.109523459814763, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 30800 + }, + { + "epoch": 0.30801, + "grad_norm": 0.9430055645442009, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 30801 + }, + { + "epoch": 0.30802, + "grad_norm": 0.9904230065172027, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 30802 + }, + { + "epoch": 0.30803, + "grad_norm": 0.9908632799235534, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 30803 + }, + { + "epoch": 0.30804, + "grad_norm": 0.9580612853285466, + "learning_rate": 0.003, + "loss": 4.032, + "step": 30804 + }, + { + "epoch": 0.30805, + "grad_norm": 0.9466218623375239, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 30805 + }, + { + "epoch": 0.30806, + "grad_norm": 0.9044946005622532, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 30806 + }, + { + "epoch": 0.30807, + "grad_norm": 0.759377359296469, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 30807 + }, + { + "epoch": 0.30808, + "grad_norm": 0.7360337601373378, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 30808 + }, + { + "epoch": 0.30809, + "grad_norm": 0.6797214623427076, + "learning_rate": 0.003, + "loss": 4.06, + "step": 30809 + }, + { + "epoch": 0.3081, + "grad_norm": 0.6745378555274573, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 30810 + }, + { + "epoch": 0.30811, + "grad_norm": 0.6566073207211455, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 30811 + }, + { + "epoch": 0.30812, + "grad_norm": 0.7250193120881936, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 30812 + }, + { + "epoch": 0.30813, + "grad_norm": 0.7072851276136148, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 30813 + }, + { + "epoch": 0.30814, + "grad_norm": 0.705444759086317, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 30814 + }, + { + "epoch": 0.30815, + "grad_norm": 0.785556256205228, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 30815 + }, + { + "epoch": 0.30816, + "grad_norm": 0.7948409605113511, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 30816 + }, + { + "epoch": 0.30817, + "grad_norm": 0.8212499143360835, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 30817 + }, + { + "epoch": 0.30818, + "grad_norm": 0.8466019317405559, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 30818 + }, + { + "epoch": 0.30819, + "grad_norm": 0.7824228247767014, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 30819 + }, + { + "epoch": 0.3082, + "grad_norm": 0.8231279167401168, + "learning_rate": 0.003, + "loss": 4.044, + "step": 30820 + }, + { + "epoch": 0.30821, + "grad_norm": 0.932067987771609, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 30821 + }, + { + "epoch": 0.30822, + "grad_norm": 0.9272212911225559, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 30822 + }, + { + "epoch": 0.30823, + "grad_norm": 0.7866391777583547, + "learning_rate": 0.003, + "loss": 4.032, + "step": 30823 + }, + { + "epoch": 0.30824, + "grad_norm": 0.7411285656144493, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 30824 + }, + { + "epoch": 0.30825, + "grad_norm": 0.7174124341762038, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 30825 + }, + { + "epoch": 0.30826, + "grad_norm": 0.7195212426240878, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 30826 + }, + { + "epoch": 0.30827, + "grad_norm": 0.6710068338457987, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 30827 + }, + { + "epoch": 0.30828, + "grad_norm": 0.6049136288262137, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 30828 + }, + { + "epoch": 0.30829, + "grad_norm": 0.5915762371102405, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 30829 + }, + { + "epoch": 0.3083, + "grad_norm": 0.6229245979850135, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 30830 + }, + { + "epoch": 0.30831, + "grad_norm": 0.6048280048742432, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 30831 + }, + { + "epoch": 0.30832, + "grad_norm": 0.6311498907802161, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 30832 + }, + { + "epoch": 0.30833, + "grad_norm": 0.6379437212409611, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 30833 + }, + { + "epoch": 0.30834, + "grad_norm": 0.6894403038237408, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 30834 + }, + { + "epoch": 0.30835, + "grad_norm": 0.7762573748993623, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 30835 + }, + { + "epoch": 0.30836, + "grad_norm": 0.9956761437422733, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 30836 + }, + { + "epoch": 0.30837, + "grad_norm": 1.3858113047805518, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 30837 + }, + { + "epoch": 0.30838, + "grad_norm": 0.8368330440204933, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 30838 + }, + { + "epoch": 0.30839, + "grad_norm": 0.703661452550211, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 30839 + }, + { + "epoch": 0.3084, + "grad_norm": 0.7062758242542257, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 30840 + }, + { + "epoch": 0.30841, + "grad_norm": 0.6470620033172044, + "learning_rate": 0.003, + "loss": 4.022, + "step": 30841 + }, + { + "epoch": 0.30842, + "grad_norm": 0.7397643359200448, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 30842 + }, + { + "epoch": 0.30843, + "grad_norm": 0.8105070688175162, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 30843 + }, + { + "epoch": 0.30844, + "grad_norm": 1.0636992493769797, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 30844 + }, + { + "epoch": 0.30845, + "grad_norm": 1.4280990489498842, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 30845 + }, + { + "epoch": 0.30846, + "grad_norm": 0.6928591669261425, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 30846 + }, + { + "epoch": 0.30847, + "grad_norm": 0.7076770874058445, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 30847 + }, + { + "epoch": 0.30848, + "grad_norm": 0.7743231000964611, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 30848 + }, + { + "epoch": 0.30849, + "grad_norm": 0.8675294369801922, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 30849 + }, + { + "epoch": 0.3085, + "grad_norm": 0.7675108237300807, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 30850 + }, + { + "epoch": 0.30851, + "grad_norm": 0.8459778347215771, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 30851 + }, + { + "epoch": 0.30852, + "grad_norm": 0.9763670540508455, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 30852 + }, + { + "epoch": 0.30853, + "grad_norm": 1.144642861624166, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 30853 + }, + { + "epoch": 0.30854, + "grad_norm": 0.9054883532072058, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 30854 + }, + { + "epoch": 0.30855, + "grad_norm": 0.8311714558514375, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 30855 + }, + { + "epoch": 0.30856, + "grad_norm": 0.8384225521014615, + "learning_rate": 0.003, + "loss": 4.021, + "step": 30856 + }, + { + "epoch": 0.30857, + "grad_norm": 0.8999811598207876, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 30857 + }, + { + "epoch": 0.30858, + "grad_norm": 1.0353681727919666, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 30858 + }, + { + "epoch": 0.30859, + "grad_norm": 0.9084499237406256, + "learning_rate": 0.003, + "loss": 4.043, + "step": 30859 + }, + { + "epoch": 0.3086, + "grad_norm": 0.7600951994651148, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 30860 + }, + { + "epoch": 0.30861, + "grad_norm": 0.6602037715887461, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 30861 + }, + { + "epoch": 0.30862, + "grad_norm": 0.7078148812033994, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 30862 + }, + { + "epoch": 0.30863, + "grad_norm": 0.914458865067099, + "learning_rate": 0.003, + "loss": 4.023, + "step": 30863 + }, + { + "epoch": 0.30864, + "grad_norm": 0.93483134601536, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 30864 + }, + { + "epoch": 0.30865, + "grad_norm": 0.7822020584614777, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 30865 + }, + { + "epoch": 0.30866, + "grad_norm": 0.7971823943460915, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 30866 + }, + { + "epoch": 0.30867, + "grad_norm": 0.9202323508184236, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 30867 + }, + { + "epoch": 0.30868, + "grad_norm": 1.1638717464557773, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 30868 + }, + { + "epoch": 0.30869, + "grad_norm": 1.1658812798223301, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 30869 + }, + { + "epoch": 0.3087, + "grad_norm": 0.9124421812408606, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 30870 + }, + { + "epoch": 0.30871, + "grad_norm": 0.9245718639081646, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 30871 + }, + { + "epoch": 0.30872, + "grad_norm": 1.0407781148622681, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 30872 + }, + { + "epoch": 0.30873, + "grad_norm": 1.0210650990759353, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 30873 + }, + { + "epoch": 0.30874, + "grad_norm": 0.9241671278028997, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 30874 + }, + { + "epoch": 0.30875, + "grad_norm": 0.7949661151547252, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 30875 + }, + { + "epoch": 0.30876, + "grad_norm": 0.7715586240858586, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 30876 + }, + { + "epoch": 0.30877, + "grad_norm": 0.976657661635659, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 30877 + }, + { + "epoch": 0.30878, + "grad_norm": 1.1563548355269564, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 30878 + }, + { + "epoch": 0.30879, + "grad_norm": 0.8792173900225312, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 30879 + }, + { + "epoch": 0.3088, + "grad_norm": 0.9716157978466045, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 30880 + }, + { + "epoch": 0.30881, + "grad_norm": 1.0947633268207957, + "learning_rate": 0.003, + "loss": 4.053, + "step": 30881 + }, + { + "epoch": 0.30882, + "grad_norm": 0.9817303479996596, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 30882 + }, + { + "epoch": 0.30883, + "grad_norm": 1.081444475160503, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 30883 + }, + { + "epoch": 0.30884, + "grad_norm": 0.867408073539162, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 30884 + }, + { + "epoch": 0.30885, + "grad_norm": 0.8747604860284365, + "learning_rate": 0.003, + "loss": 4.06, + "step": 30885 + }, + { + "epoch": 0.30886, + "grad_norm": 0.845118724240814, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 30886 + }, + { + "epoch": 0.30887, + "grad_norm": 0.7991888115608858, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 30887 + }, + { + "epoch": 0.30888, + "grad_norm": 0.7493543678681479, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 30888 + }, + { + "epoch": 0.30889, + "grad_norm": 0.7578823190588796, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 30889 + }, + { + "epoch": 0.3089, + "grad_norm": 0.8294631525847005, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 30890 + }, + { + "epoch": 0.30891, + "grad_norm": 0.7116140868217002, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 30891 + }, + { + "epoch": 0.30892, + "grad_norm": 0.654311368383866, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 30892 + }, + { + "epoch": 0.30893, + "grad_norm": 0.6774471319477997, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 30893 + }, + { + "epoch": 0.30894, + "grad_norm": 0.7047855297940542, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 30894 + }, + { + "epoch": 0.30895, + "grad_norm": 0.7092946895890737, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 30895 + }, + { + "epoch": 0.30896, + "grad_norm": 0.684974846546833, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 30896 + }, + { + "epoch": 0.30897, + "grad_norm": 0.7568743249712528, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 30897 + }, + { + "epoch": 0.30898, + "grad_norm": 1.078842950755968, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 30898 + }, + { + "epoch": 0.30899, + "grad_norm": 1.2599499182714489, + "learning_rate": 0.003, + "loss": 3.9964, + "step": 30899 + }, + { + "epoch": 0.309, + "grad_norm": 0.778162779113319, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 30900 + }, + { + "epoch": 0.30901, + "grad_norm": 0.7736141724046166, + "learning_rate": 0.003, + "loss": 4.048, + "step": 30901 + }, + { + "epoch": 0.30902, + "grad_norm": 0.864063956862464, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 30902 + }, + { + "epoch": 0.30903, + "grad_norm": 0.8314649489435436, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 30903 + }, + { + "epoch": 0.30904, + "grad_norm": 0.8560067482178059, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 30904 + }, + { + "epoch": 0.30905, + "grad_norm": 0.9500970616273259, + "learning_rate": 0.003, + "loss": 4.058, + "step": 30905 + }, + { + "epoch": 0.30906, + "grad_norm": 1.0445573042332406, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 30906 + }, + { + "epoch": 0.30907, + "grad_norm": 1.0786049859555682, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 30907 + }, + { + "epoch": 0.30908, + "grad_norm": 0.9407802372914399, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 30908 + }, + { + "epoch": 0.30909, + "grad_norm": 0.861749779455875, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 30909 + }, + { + "epoch": 0.3091, + "grad_norm": 0.9265995937642065, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 30910 + }, + { + "epoch": 0.30911, + "grad_norm": 0.8328510738699823, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 30911 + }, + { + "epoch": 0.30912, + "grad_norm": 0.6866715758244357, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 30912 + }, + { + "epoch": 0.30913, + "grad_norm": 0.6281154588420825, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 30913 + }, + { + "epoch": 0.30914, + "grad_norm": 0.5716326698369153, + "learning_rate": 0.003, + "loss": 3.9884, + "step": 30914 + }, + { + "epoch": 0.30915, + "grad_norm": 0.6910570005670672, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 30915 + }, + { + "epoch": 0.30916, + "grad_norm": 0.8043744295696714, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 30916 + }, + { + "epoch": 0.30917, + "grad_norm": 0.8849616687699069, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 30917 + }, + { + "epoch": 0.30918, + "grad_norm": 0.9119514383093178, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 30918 + }, + { + "epoch": 0.30919, + "grad_norm": 0.8393485831475815, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 30919 + }, + { + "epoch": 0.3092, + "grad_norm": 0.8601155065363455, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 30920 + }, + { + "epoch": 0.30921, + "grad_norm": 0.8677062198822795, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 30921 + }, + { + "epoch": 0.30922, + "grad_norm": 0.9365731305587933, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 30922 + }, + { + "epoch": 0.30923, + "grad_norm": 1.0871017199482813, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 30923 + }, + { + "epoch": 0.30924, + "grad_norm": 0.8500562001363362, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 30924 + }, + { + "epoch": 0.30925, + "grad_norm": 0.9464031272829073, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 30925 + }, + { + "epoch": 0.30926, + "grad_norm": 1.037790182972451, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 30926 + }, + { + "epoch": 0.30927, + "grad_norm": 0.9953459733870422, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 30927 + }, + { + "epoch": 0.30928, + "grad_norm": 0.9877682036439628, + "learning_rate": 0.003, + "loss": 4.048, + "step": 30928 + }, + { + "epoch": 0.30929, + "grad_norm": 0.8721875420586115, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 30929 + }, + { + "epoch": 0.3093, + "grad_norm": 0.7987285598352868, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 30930 + }, + { + "epoch": 0.30931, + "grad_norm": 0.8278858765422459, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 30931 + }, + { + "epoch": 0.30932, + "grad_norm": 0.8786533316448552, + "learning_rate": 0.003, + "loss": 4.038, + "step": 30932 + }, + { + "epoch": 0.30933, + "grad_norm": 0.977061469030929, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 30933 + }, + { + "epoch": 0.30934, + "grad_norm": 1.0469392450688666, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 30934 + }, + { + "epoch": 0.30935, + "grad_norm": 1.0680501192410035, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 30935 + }, + { + "epoch": 0.30936, + "grad_norm": 0.7741441716279966, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 30936 + }, + { + "epoch": 0.30937, + "grad_norm": 0.6731085510291004, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 30937 + }, + { + "epoch": 0.30938, + "grad_norm": 0.7686498915267687, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 30938 + }, + { + "epoch": 0.30939, + "grad_norm": 0.8081173299818549, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 30939 + }, + { + "epoch": 0.3094, + "grad_norm": 0.7104581117488192, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 30940 + }, + { + "epoch": 0.30941, + "grad_norm": 0.6948617356650509, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 30941 + }, + { + "epoch": 0.30942, + "grad_norm": 0.6962096529647628, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 30942 + }, + { + "epoch": 0.30943, + "grad_norm": 0.6740869619255537, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 30943 + }, + { + "epoch": 0.30944, + "grad_norm": 0.6707998718170635, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 30944 + }, + { + "epoch": 0.30945, + "grad_norm": 0.7060922173281504, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 30945 + }, + { + "epoch": 0.30946, + "grad_norm": 0.7144997641833017, + "learning_rate": 0.003, + "loss": 4.0032, + "step": 30946 + }, + { + "epoch": 0.30947, + "grad_norm": 0.7202735774920945, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 30947 + }, + { + "epoch": 0.30948, + "grad_norm": 0.8363543652847316, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 30948 + }, + { + "epoch": 0.30949, + "grad_norm": 1.0279347679092345, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 30949 + }, + { + "epoch": 0.3095, + "grad_norm": 1.153782803661401, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 30950 + }, + { + "epoch": 0.30951, + "grad_norm": 0.6971226568892515, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 30951 + }, + { + "epoch": 0.30952, + "grad_norm": 0.717740490118408, + "learning_rate": 0.003, + "loss": 4.0134, + "step": 30952 + }, + { + "epoch": 0.30953, + "grad_norm": 0.8717125098624835, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 30953 + }, + { + "epoch": 0.30954, + "grad_norm": 0.9389074051286843, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 30954 + }, + { + "epoch": 0.30955, + "grad_norm": 0.8891870187625002, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 30955 + }, + { + "epoch": 0.30956, + "grad_norm": 0.8001640179460102, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 30956 + }, + { + "epoch": 0.30957, + "grad_norm": 0.8525564035168054, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 30957 + }, + { + "epoch": 0.30958, + "grad_norm": 0.8210509522842179, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 30958 + }, + { + "epoch": 0.30959, + "grad_norm": 0.6982756881635637, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 30959 + }, + { + "epoch": 0.3096, + "grad_norm": 0.6678187264713864, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 30960 + }, + { + "epoch": 0.30961, + "grad_norm": 0.7175310383546256, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 30961 + }, + { + "epoch": 0.30962, + "grad_norm": 0.8758024362543068, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 30962 + }, + { + "epoch": 0.30963, + "grad_norm": 1.055418853626481, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 30963 + }, + { + "epoch": 0.30964, + "grad_norm": 1.0413565293167013, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 30964 + }, + { + "epoch": 0.30965, + "grad_norm": 1.161427818455989, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 30965 + }, + { + "epoch": 0.30966, + "grad_norm": 0.820736439227036, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 30966 + }, + { + "epoch": 0.30967, + "grad_norm": 0.6556348644330107, + "learning_rate": 0.003, + "loss": 4.022, + "step": 30967 + }, + { + "epoch": 0.30968, + "grad_norm": 0.682304018705786, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 30968 + }, + { + "epoch": 0.30969, + "grad_norm": 0.7048564642048618, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 30969 + }, + { + "epoch": 0.3097, + "grad_norm": 0.745457284050935, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 30970 + }, + { + "epoch": 0.30971, + "grad_norm": 0.7828806572321504, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 30971 + }, + { + "epoch": 0.30972, + "grad_norm": 0.9286712905943237, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 30972 + }, + { + "epoch": 0.30973, + "grad_norm": 1.0864493218562645, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 30973 + }, + { + "epoch": 0.30974, + "grad_norm": 0.9710078484457113, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 30974 + }, + { + "epoch": 0.30975, + "grad_norm": 1.1108268964741557, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 30975 + }, + { + "epoch": 0.30976, + "grad_norm": 1.091960849735545, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 30976 + }, + { + "epoch": 0.30977, + "grad_norm": 1.006376369751325, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 30977 + }, + { + "epoch": 0.30978, + "grad_norm": 0.8294378277833414, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 30978 + }, + { + "epoch": 0.30979, + "grad_norm": 0.6829509783813068, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 30979 + }, + { + "epoch": 0.3098, + "grad_norm": 0.7442709759447237, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 30980 + }, + { + "epoch": 0.30981, + "grad_norm": 0.885562801073043, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 30981 + }, + { + "epoch": 0.30982, + "grad_norm": 0.9573031245840209, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 30982 + }, + { + "epoch": 0.30983, + "grad_norm": 1.026908261272219, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 30983 + }, + { + "epoch": 0.30984, + "grad_norm": 0.9462754429563887, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 30984 + }, + { + "epoch": 0.30985, + "grad_norm": 0.9680137468194956, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 30985 + }, + { + "epoch": 0.30986, + "grad_norm": 1.0837831790425496, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 30986 + }, + { + "epoch": 0.30987, + "grad_norm": 0.926782225671113, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 30987 + }, + { + "epoch": 0.30988, + "grad_norm": 0.9847682386833083, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 30988 + }, + { + "epoch": 0.30989, + "grad_norm": 1.0284223130794317, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 30989 + }, + { + "epoch": 0.3099, + "grad_norm": 0.8803483620370607, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 30990 + }, + { + "epoch": 0.30991, + "grad_norm": 0.8339326251242171, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 30991 + }, + { + "epoch": 0.30992, + "grad_norm": 0.7892997591454144, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 30992 + }, + { + "epoch": 0.30993, + "grad_norm": 0.811248594174698, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 30993 + }, + { + "epoch": 0.30994, + "grad_norm": 0.8062876234992918, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 30994 + }, + { + "epoch": 0.30995, + "grad_norm": 0.7518451160126378, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 30995 + }, + { + "epoch": 0.30996, + "grad_norm": 0.6559777697465964, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 30996 + }, + { + "epoch": 0.30997, + "grad_norm": 0.605975918738231, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 30997 + }, + { + "epoch": 0.30998, + "grad_norm": 0.671370272773477, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 30998 + }, + { + "epoch": 0.30999, + "grad_norm": 0.6709486752236141, + "learning_rate": 0.003, + "loss": 4.033, + "step": 30999 + }, + { + "epoch": 0.31, + "grad_norm": 0.7338335374064964, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 31000 + }, + { + "epoch": 0.31001, + "grad_norm": 0.835707778809499, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 31001 + }, + { + "epoch": 0.31002, + "grad_norm": 1.0131585189066932, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 31002 + }, + { + "epoch": 0.31003, + "grad_norm": 1.2302812820593076, + "learning_rate": 0.003, + "loss": 4.027, + "step": 31003 + }, + { + "epoch": 0.31004, + "grad_norm": 0.7293395447434757, + "learning_rate": 0.003, + "loss": 4.033, + "step": 31004 + }, + { + "epoch": 0.31005, + "grad_norm": 0.6551589996734186, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 31005 + }, + { + "epoch": 0.31006, + "grad_norm": 0.6339743227078345, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 31006 + }, + { + "epoch": 0.31007, + "grad_norm": 0.596116908478433, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 31007 + }, + { + "epoch": 0.31008, + "grad_norm": 0.5515872747727149, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 31008 + }, + { + "epoch": 0.31009, + "grad_norm": 0.5287433989556892, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 31009 + }, + { + "epoch": 0.3101, + "grad_norm": 0.5746812971715765, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 31010 + }, + { + "epoch": 0.31011, + "grad_norm": 0.7118937154303365, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 31011 + }, + { + "epoch": 0.31012, + "grad_norm": 0.8162206969463668, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 31012 + }, + { + "epoch": 0.31013, + "grad_norm": 0.7548004345470204, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 31013 + }, + { + "epoch": 0.31014, + "grad_norm": 0.6830373572298705, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 31014 + }, + { + "epoch": 0.31015, + "grad_norm": 0.7012547541580102, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 31015 + }, + { + "epoch": 0.31016, + "grad_norm": 0.7553299293266569, + "learning_rate": 0.003, + "loss": 4.028, + "step": 31016 + }, + { + "epoch": 0.31017, + "grad_norm": 0.8339908407659491, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 31017 + }, + { + "epoch": 0.31018, + "grad_norm": 1.0224020663266091, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 31018 + }, + { + "epoch": 0.31019, + "grad_norm": 1.0961216129609759, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 31019 + }, + { + "epoch": 0.3102, + "grad_norm": 0.932002824376987, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 31020 + }, + { + "epoch": 0.31021, + "grad_norm": 0.9687302484483488, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 31021 + }, + { + "epoch": 0.31022, + "grad_norm": 0.9552239547163932, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 31022 + }, + { + "epoch": 0.31023, + "grad_norm": 1.257045589159289, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 31023 + }, + { + "epoch": 0.31024, + "grad_norm": 0.9853366323513992, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 31024 + }, + { + "epoch": 0.31025, + "grad_norm": 0.9317811307956058, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 31025 + }, + { + "epoch": 0.31026, + "grad_norm": 0.9484528903511715, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 31026 + }, + { + "epoch": 0.31027, + "grad_norm": 1.0703352376046482, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 31027 + }, + { + "epoch": 0.31028, + "grad_norm": 1.0817165846514012, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 31028 + }, + { + "epoch": 0.31029, + "grad_norm": 0.8139267355162888, + "learning_rate": 0.003, + "loss": 4.04, + "step": 31029 + }, + { + "epoch": 0.3103, + "grad_norm": 0.7798121267458615, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 31030 + }, + { + "epoch": 0.31031, + "grad_norm": 0.8224564797754106, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 31031 + }, + { + "epoch": 0.31032, + "grad_norm": 0.7570758717756095, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 31032 + }, + { + "epoch": 0.31033, + "grad_norm": 0.7576721999266691, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 31033 + }, + { + "epoch": 0.31034, + "grad_norm": 0.8064131656169007, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 31034 + }, + { + "epoch": 0.31035, + "grad_norm": 0.9081472479934298, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 31035 + }, + { + "epoch": 0.31036, + "grad_norm": 1.0124863061071248, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 31036 + }, + { + "epoch": 0.31037, + "grad_norm": 1.2004888205712148, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 31037 + }, + { + "epoch": 0.31038, + "grad_norm": 0.885908380875526, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 31038 + }, + { + "epoch": 0.31039, + "grad_norm": 0.7946190508781668, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 31039 + }, + { + "epoch": 0.3104, + "grad_norm": 0.8398811895755552, + "learning_rate": 0.003, + "loss": 4.046, + "step": 31040 + }, + { + "epoch": 0.31041, + "grad_norm": 0.8079893266326794, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 31041 + }, + { + "epoch": 0.31042, + "grad_norm": 0.7912827310592263, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 31042 + }, + { + "epoch": 0.31043, + "grad_norm": 0.9084398130608581, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 31043 + }, + { + "epoch": 0.31044, + "grad_norm": 0.947306554734898, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 31044 + }, + { + "epoch": 0.31045, + "grad_norm": 0.8358163981934753, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 31045 + }, + { + "epoch": 0.31046, + "grad_norm": 0.7817003343741666, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 31046 + }, + { + "epoch": 0.31047, + "grad_norm": 0.6613145028427724, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 31047 + }, + { + "epoch": 0.31048, + "grad_norm": 0.6936189603118382, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 31048 + }, + { + "epoch": 0.31049, + "grad_norm": 0.7590241324708386, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 31049 + }, + { + "epoch": 0.3105, + "grad_norm": 0.7132149902605485, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 31050 + }, + { + "epoch": 0.31051, + "grad_norm": 0.7856538510872045, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 31051 + }, + { + "epoch": 0.31052, + "grad_norm": 1.0454499292239772, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 31052 + }, + { + "epoch": 0.31053, + "grad_norm": 1.216803873518095, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 31053 + }, + { + "epoch": 0.31054, + "grad_norm": 0.686016305284535, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 31054 + }, + { + "epoch": 0.31055, + "grad_norm": 0.649826183521942, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 31055 + }, + { + "epoch": 0.31056, + "grad_norm": 0.6965922841821345, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 31056 + }, + { + "epoch": 0.31057, + "grad_norm": 0.6893290408468875, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 31057 + }, + { + "epoch": 0.31058, + "grad_norm": 0.689710327966547, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 31058 + }, + { + "epoch": 0.31059, + "grad_norm": 0.7078059697593403, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 31059 + }, + { + "epoch": 0.3106, + "grad_norm": 0.7266861777177281, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 31060 + }, + { + "epoch": 0.31061, + "grad_norm": 1.014360325459925, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 31061 + }, + { + "epoch": 0.31062, + "grad_norm": 1.2100780791637273, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 31062 + }, + { + "epoch": 0.31063, + "grad_norm": 0.7814388638722781, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 31063 + }, + { + "epoch": 0.31064, + "grad_norm": 0.7987441523062871, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 31064 + }, + { + "epoch": 0.31065, + "grad_norm": 0.8063535819177563, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 31065 + }, + { + "epoch": 0.31066, + "grad_norm": 0.8790037433793394, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 31066 + }, + { + "epoch": 0.31067, + "grad_norm": 1.0004837095558254, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 31067 + }, + { + "epoch": 0.31068, + "grad_norm": 0.9883808148319644, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 31068 + }, + { + "epoch": 0.31069, + "grad_norm": 0.8481105701572573, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 31069 + }, + { + "epoch": 0.3107, + "grad_norm": 0.8537340899326175, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 31070 + }, + { + "epoch": 0.31071, + "grad_norm": 0.8679776259807989, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 31071 + }, + { + "epoch": 0.31072, + "grad_norm": 0.9282566013463376, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 31072 + }, + { + "epoch": 0.31073, + "grad_norm": 0.9032554650560465, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 31073 + }, + { + "epoch": 0.31074, + "grad_norm": 0.8712692842079965, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 31074 + }, + { + "epoch": 0.31075, + "grad_norm": 0.9278415717566548, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 31075 + }, + { + "epoch": 0.31076, + "grad_norm": 1.0005566571876519, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 31076 + }, + { + "epoch": 0.31077, + "grad_norm": 1.157025356897531, + "learning_rate": 0.003, + "loss": 4.059, + "step": 31077 + }, + { + "epoch": 0.31078, + "grad_norm": 0.9825780881863968, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 31078 + }, + { + "epoch": 0.31079, + "grad_norm": 0.9130960374629797, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 31079 + }, + { + "epoch": 0.3108, + "grad_norm": 0.7799005755232936, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 31080 + }, + { + "epoch": 0.31081, + "grad_norm": 0.8526306648918017, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 31081 + }, + { + "epoch": 0.31082, + "grad_norm": 1.0947067777263588, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 31082 + }, + { + "epoch": 0.31083, + "grad_norm": 1.1673920614488493, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 31083 + }, + { + "epoch": 0.31084, + "grad_norm": 0.7377706791397789, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 31084 + }, + { + "epoch": 0.31085, + "grad_norm": 0.6411479960245204, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 31085 + }, + { + "epoch": 0.31086, + "grad_norm": 0.6203767068765476, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 31086 + }, + { + "epoch": 0.31087, + "grad_norm": 0.6627128439228822, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 31087 + }, + { + "epoch": 0.31088, + "grad_norm": 0.6324818538763578, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 31088 + }, + { + "epoch": 0.31089, + "grad_norm": 0.5448869350551743, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 31089 + }, + { + "epoch": 0.3109, + "grad_norm": 0.4964369232111863, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 31090 + }, + { + "epoch": 0.31091, + "grad_norm": 0.4623353225491741, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 31091 + }, + { + "epoch": 0.31092, + "grad_norm": 0.5268667654976494, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 31092 + }, + { + "epoch": 0.31093, + "grad_norm": 0.5374996747579416, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 31093 + }, + { + "epoch": 0.31094, + "grad_norm": 0.5861042164616919, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 31094 + }, + { + "epoch": 0.31095, + "grad_norm": 0.5795076824572069, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 31095 + }, + { + "epoch": 0.31096, + "grad_norm": 0.5924979126186535, + "learning_rate": 0.003, + "loss": 4.018, + "step": 31096 + }, + { + "epoch": 0.31097, + "grad_norm": 0.6394316032476024, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 31097 + }, + { + "epoch": 0.31098, + "grad_norm": 0.7502514250773625, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 31098 + }, + { + "epoch": 0.31099, + "grad_norm": 1.203459635573585, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 31099 + }, + { + "epoch": 0.311, + "grad_norm": 1.3422316334643631, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 31100 + }, + { + "epoch": 0.31101, + "grad_norm": 0.6338723194433382, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 31101 + }, + { + "epoch": 0.31102, + "grad_norm": 0.7626809188218358, + "learning_rate": 0.003, + "loss": 4.003, + "step": 31102 + }, + { + "epoch": 0.31103, + "grad_norm": 0.8721442591783304, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 31103 + }, + { + "epoch": 0.31104, + "grad_norm": 0.9059819078573021, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 31104 + }, + { + "epoch": 0.31105, + "grad_norm": 0.8776106723378734, + "learning_rate": 0.003, + "loss": 4.0059, + "step": 31105 + }, + { + "epoch": 0.31106, + "grad_norm": 0.8368024936953757, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 31106 + }, + { + "epoch": 0.31107, + "grad_norm": 0.7350101081747782, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 31107 + }, + { + "epoch": 0.31108, + "grad_norm": 0.7820924972807999, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 31108 + }, + { + "epoch": 0.31109, + "grad_norm": 0.9627609301477671, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 31109 + }, + { + "epoch": 0.3111, + "grad_norm": 1.1433743004983135, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 31110 + }, + { + "epoch": 0.31111, + "grad_norm": 1.102115815977687, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 31111 + }, + { + "epoch": 0.31112, + "grad_norm": 1.163092415598571, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 31112 + }, + { + "epoch": 0.31113, + "grad_norm": 0.8018394827470376, + "learning_rate": 0.003, + "loss": 4.048, + "step": 31113 + }, + { + "epoch": 0.31114, + "grad_norm": 0.8417857953568431, + "learning_rate": 0.003, + "loss": 4.019, + "step": 31114 + }, + { + "epoch": 0.31115, + "grad_norm": 1.006621112755898, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 31115 + }, + { + "epoch": 0.31116, + "grad_norm": 1.0530222688174324, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 31116 + }, + { + "epoch": 0.31117, + "grad_norm": 0.8205619452524133, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 31117 + }, + { + "epoch": 0.31118, + "grad_norm": 0.9097267082533321, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 31118 + }, + { + "epoch": 0.31119, + "grad_norm": 0.835064870487331, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 31119 + }, + { + "epoch": 0.3112, + "grad_norm": 0.8352121026036825, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 31120 + }, + { + "epoch": 0.31121, + "grad_norm": 0.9050982692869832, + "learning_rate": 0.003, + "loss": 4.055, + "step": 31121 + }, + { + "epoch": 0.31122, + "grad_norm": 1.0397825182148517, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 31122 + }, + { + "epoch": 0.31123, + "grad_norm": 1.0114958501051587, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 31123 + }, + { + "epoch": 0.31124, + "grad_norm": 1.0248608227633966, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 31124 + }, + { + "epoch": 0.31125, + "grad_norm": 0.9472824474811777, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 31125 + }, + { + "epoch": 0.31126, + "grad_norm": 0.7461692781053041, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 31126 + }, + { + "epoch": 0.31127, + "grad_norm": 0.6113110203431844, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 31127 + }, + { + "epoch": 0.31128, + "grad_norm": 0.607752402401117, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 31128 + }, + { + "epoch": 0.31129, + "grad_norm": 0.6592647692413477, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 31129 + }, + { + "epoch": 0.3113, + "grad_norm": 0.7907267354504305, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 31130 + }, + { + "epoch": 0.31131, + "grad_norm": 0.8905261796330373, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 31131 + }, + { + "epoch": 0.31132, + "grad_norm": 0.996129675215651, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 31132 + }, + { + "epoch": 0.31133, + "grad_norm": 1.1016336688174444, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 31133 + }, + { + "epoch": 0.31134, + "grad_norm": 0.9922264625070881, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 31134 + }, + { + "epoch": 0.31135, + "grad_norm": 1.009947512332734, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 31135 + }, + { + "epoch": 0.31136, + "grad_norm": 0.8769436963227613, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 31136 + }, + { + "epoch": 0.31137, + "grad_norm": 0.7014712809017916, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 31137 + }, + { + "epoch": 0.31138, + "grad_norm": 0.702218812811567, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 31138 + }, + { + "epoch": 0.31139, + "grad_norm": 0.7439306043055479, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 31139 + }, + { + "epoch": 0.3114, + "grad_norm": 0.9224585574994658, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 31140 + }, + { + "epoch": 0.31141, + "grad_norm": 1.0446365990854478, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 31141 + }, + { + "epoch": 0.31142, + "grad_norm": 1.105236990680016, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 31142 + }, + { + "epoch": 0.31143, + "grad_norm": 1.0396398340126907, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 31143 + }, + { + "epoch": 0.31144, + "grad_norm": 0.8302062274489633, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 31144 + }, + { + "epoch": 0.31145, + "grad_norm": 0.7613756825808332, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 31145 + }, + { + "epoch": 0.31146, + "grad_norm": 0.8546883548982939, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 31146 + }, + { + "epoch": 0.31147, + "grad_norm": 1.0161584344289527, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 31147 + }, + { + "epoch": 0.31148, + "grad_norm": 1.083246619995139, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 31148 + }, + { + "epoch": 0.31149, + "grad_norm": 0.9901365152875319, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 31149 + }, + { + "epoch": 0.3115, + "grad_norm": 1.1206056175869872, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 31150 + }, + { + "epoch": 0.31151, + "grad_norm": 0.8388386088619617, + "learning_rate": 0.003, + "loss": 4.042, + "step": 31151 + }, + { + "epoch": 0.31152, + "grad_norm": 0.8404325325388726, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 31152 + }, + { + "epoch": 0.31153, + "grad_norm": 0.7588165159526111, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 31153 + }, + { + "epoch": 0.31154, + "grad_norm": 0.7769497270635726, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 31154 + }, + { + "epoch": 0.31155, + "grad_norm": 0.8877061047103989, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 31155 + }, + { + "epoch": 0.31156, + "grad_norm": 0.908931265495274, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 31156 + }, + { + "epoch": 0.31157, + "grad_norm": 1.121312814408468, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 31157 + }, + { + "epoch": 0.31158, + "grad_norm": 1.0555540397999081, + "learning_rate": 0.003, + "loss": 4.042, + "step": 31158 + }, + { + "epoch": 0.31159, + "grad_norm": 0.8203079032313007, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 31159 + }, + { + "epoch": 0.3116, + "grad_norm": 0.8161678281528759, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 31160 + }, + { + "epoch": 0.31161, + "grad_norm": 0.8143157120474863, + "learning_rate": 0.003, + "loss": 4.023, + "step": 31161 + }, + { + "epoch": 0.31162, + "grad_norm": 0.7004886376149924, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 31162 + }, + { + "epoch": 0.31163, + "grad_norm": 0.7083221034778194, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 31163 + }, + { + "epoch": 0.31164, + "grad_norm": 0.6261289049636733, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 31164 + }, + { + "epoch": 0.31165, + "grad_norm": 0.6291110112905594, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 31165 + }, + { + "epoch": 0.31166, + "grad_norm": 0.7116744880511073, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 31166 + }, + { + "epoch": 0.31167, + "grad_norm": 0.8192851469984462, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 31167 + }, + { + "epoch": 0.31168, + "grad_norm": 0.947055552564821, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 31168 + }, + { + "epoch": 0.31169, + "grad_norm": 0.9857294685078094, + "learning_rate": 0.003, + "loss": 4.07, + "step": 31169 + }, + { + "epoch": 0.3117, + "grad_norm": 0.8594826753007103, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 31170 + }, + { + "epoch": 0.31171, + "grad_norm": 0.6354570199502176, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 31171 + }, + { + "epoch": 0.31172, + "grad_norm": 0.6753245339627981, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 31172 + }, + { + "epoch": 0.31173, + "grad_norm": 0.6433501948393092, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 31173 + }, + { + "epoch": 0.31174, + "grad_norm": 0.5653324812738522, + "learning_rate": 0.003, + "loss": 4.03, + "step": 31174 + }, + { + "epoch": 0.31175, + "grad_norm": 0.6040435639652715, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 31175 + }, + { + "epoch": 0.31176, + "grad_norm": 0.6337658695834149, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 31176 + }, + { + "epoch": 0.31177, + "grad_norm": 0.667875234211774, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 31177 + }, + { + "epoch": 0.31178, + "grad_norm": 0.7578344412188126, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 31178 + }, + { + "epoch": 0.31179, + "grad_norm": 1.0074489125803223, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 31179 + }, + { + "epoch": 0.3118, + "grad_norm": 1.2710010078722682, + "learning_rate": 0.003, + "loss": 4.0058, + "step": 31180 + }, + { + "epoch": 0.31181, + "grad_norm": 0.8532903662863325, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 31181 + }, + { + "epoch": 0.31182, + "grad_norm": 0.6950093586071396, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 31182 + }, + { + "epoch": 0.31183, + "grad_norm": 0.7659615971190149, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 31183 + }, + { + "epoch": 0.31184, + "grad_norm": 0.7709934089389846, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 31184 + }, + { + "epoch": 0.31185, + "grad_norm": 0.8931813774604814, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 31185 + }, + { + "epoch": 0.31186, + "grad_norm": 0.9024451847565017, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 31186 + }, + { + "epoch": 0.31187, + "grad_norm": 1.0764724245940203, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 31187 + }, + { + "epoch": 0.31188, + "grad_norm": 1.0566140651124067, + "learning_rate": 0.003, + "loss": 3.9906, + "step": 31188 + }, + { + "epoch": 0.31189, + "grad_norm": 0.94519639485893, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 31189 + }, + { + "epoch": 0.3119, + "grad_norm": 0.9445190421239597, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 31190 + }, + { + "epoch": 0.31191, + "grad_norm": 0.8960995302646945, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 31191 + }, + { + "epoch": 0.31192, + "grad_norm": 0.9604325377420415, + "learning_rate": 0.003, + "loss": 4.044, + "step": 31192 + }, + { + "epoch": 0.31193, + "grad_norm": 0.853307273334864, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 31193 + }, + { + "epoch": 0.31194, + "grad_norm": 0.8203948862762644, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 31194 + }, + { + "epoch": 0.31195, + "grad_norm": 0.8259406214037516, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 31195 + }, + { + "epoch": 0.31196, + "grad_norm": 0.8213421034820623, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 31196 + }, + { + "epoch": 0.31197, + "grad_norm": 0.8222416138652324, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 31197 + }, + { + "epoch": 0.31198, + "grad_norm": 0.7855947038427131, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 31198 + }, + { + "epoch": 0.31199, + "grad_norm": 0.7799791970512702, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 31199 + }, + { + "epoch": 0.312, + "grad_norm": 0.7222829207529543, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 31200 + }, + { + "epoch": 0.31201, + "grad_norm": 0.7044800552835228, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 31201 + }, + { + "epoch": 0.31202, + "grad_norm": 0.8104473490382206, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 31202 + }, + { + "epoch": 0.31203, + "grad_norm": 0.9538999567272132, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 31203 + }, + { + "epoch": 0.31204, + "grad_norm": 1.1508904934712019, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 31204 + }, + { + "epoch": 0.31205, + "grad_norm": 0.7259607346237986, + "learning_rate": 0.003, + "loss": 4.038, + "step": 31205 + }, + { + "epoch": 0.31206, + "grad_norm": 0.5886160340174087, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 31206 + }, + { + "epoch": 0.31207, + "grad_norm": 0.6046105316961712, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 31207 + }, + { + "epoch": 0.31208, + "grad_norm": 0.7286513798323639, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 31208 + }, + { + "epoch": 0.31209, + "grad_norm": 0.8767145849286827, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 31209 + }, + { + "epoch": 0.3121, + "grad_norm": 0.9133316611541883, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 31210 + }, + { + "epoch": 0.31211, + "grad_norm": 0.9176336056733225, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 31211 + }, + { + "epoch": 0.31212, + "grad_norm": 1.143458434556926, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 31212 + }, + { + "epoch": 0.31213, + "grad_norm": 1.0104344144223345, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 31213 + }, + { + "epoch": 0.31214, + "grad_norm": 0.8941844385027341, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 31214 + }, + { + "epoch": 0.31215, + "grad_norm": 0.9712433776199452, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 31215 + }, + { + "epoch": 0.31216, + "grad_norm": 0.9931854447349524, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 31216 + }, + { + "epoch": 0.31217, + "grad_norm": 1.0769058390328516, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 31217 + }, + { + "epoch": 0.31218, + "grad_norm": 0.967013220301073, + "learning_rate": 0.003, + "loss": 4.076, + "step": 31218 + }, + { + "epoch": 0.31219, + "grad_norm": 1.0516868253101601, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 31219 + }, + { + "epoch": 0.3122, + "grad_norm": 0.9577932148076674, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 31220 + }, + { + "epoch": 0.31221, + "grad_norm": 0.8959379395188523, + "learning_rate": 0.003, + "loss": 4.055, + "step": 31221 + }, + { + "epoch": 0.31222, + "grad_norm": 0.8763575877702239, + "learning_rate": 0.003, + "loss": 4.0033, + "step": 31222 + }, + { + "epoch": 0.31223, + "grad_norm": 0.8000503880564412, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 31223 + }, + { + "epoch": 0.31224, + "grad_norm": 0.9268896245523388, + "learning_rate": 0.003, + "loss": 4.059, + "step": 31224 + }, + { + "epoch": 0.31225, + "grad_norm": 0.9368815704520915, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 31225 + }, + { + "epoch": 0.31226, + "grad_norm": 0.946862214047098, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 31226 + }, + { + "epoch": 0.31227, + "grad_norm": 0.8608209631936689, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 31227 + }, + { + "epoch": 0.31228, + "grad_norm": 0.7189427748260587, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 31228 + }, + { + "epoch": 0.31229, + "grad_norm": 0.7235214968231489, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 31229 + }, + { + "epoch": 0.3123, + "grad_norm": 0.6878232450952642, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 31230 + }, + { + "epoch": 0.31231, + "grad_norm": 0.8090304665472239, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 31231 + }, + { + "epoch": 0.31232, + "grad_norm": 0.9297765536479975, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 31232 + }, + { + "epoch": 0.31233, + "grad_norm": 0.9699722151862058, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 31233 + }, + { + "epoch": 0.31234, + "grad_norm": 1.0025669497327474, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 31234 + }, + { + "epoch": 0.31235, + "grad_norm": 1.1092291259320062, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 31235 + }, + { + "epoch": 0.31236, + "grad_norm": 0.8202534226358841, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 31236 + }, + { + "epoch": 0.31237, + "grad_norm": 0.7975553984533527, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 31237 + }, + { + "epoch": 0.31238, + "grad_norm": 0.8256148091172824, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 31238 + }, + { + "epoch": 0.31239, + "grad_norm": 0.7484664067909651, + "learning_rate": 0.003, + "loss": 3.9989, + "step": 31239 + }, + { + "epoch": 0.3124, + "grad_norm": 0.581875811854712, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 31240 + }, + { + "epoch": 0.31241, + "grad_norm": 0.4955379386464406, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 31241 + }, + { + "epoch": 0.31242, + "grad_norm": 0.5439994103430057, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 31242 + }, + { + "epoch": 0.31243, + "grad_norm": 0.5613986779769449, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 31243 + }, + { + "epoch": 0.31244, + "grad_norm": 0.5636587556106651, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 31244 + }, + { + "epoch": 0.31245, + "grad_norm": 0.6373450148437212, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 31245 + }, + { + "epoch": 0.31246, + "grad_norm": 0.6811196392484467, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 31246 + }, + { + "epoch": 0.31247, + "grad_norm": 0.8775198527097631, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 31247 + }, + { + "epoch": 0.31248, + "grad_norm": 1.1292727150103001, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 31248 + }, + { + "epoch": 0.31249, + "grad_norm": 1.06569438341617, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 31249 + }, + { + "epoch": 0.3125, + "grad_norm": 0.9780876224448649, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 31250 + }, + { + "epoch": 0.31251, + "grad_norm": 0.8691640580063306, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 31251 + }, + { + "epoch": 0.31252, + "grad_norm": 0.9031264123239988, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 31252 + }, + { + "epoch": 0.31253, + "grad_norm": 0.8440934592328118, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 31253 + }, + { + "epoch": 0.31254, + "grad_norm": 0.9136346820374134, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 31254 + }, + { + "epoch": 0.31255, + "grad_norm": 0.9776081752405139, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 31255 + }, + { + "epoch": 0.31256, + "grad_norm": 1.013687280085767, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 31256 + }, + { + "epoch": 0.31257, + "grad_norm": 0.8211032359131123, + "learning_rate": 0.003, + "loss": 4.061, + "step": 31257 + }, + { + "epoch": 0.31258, + "grad_norm": 0.8214192351733371, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 31258 + }, + { + "epoch": 0.31259, + "grad_norm": 0.7452837584323339, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 31259 + }, + { + "epoch": 0.3126, + "grad_norm": 0.7750382116581828, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 31260 + }, + { + "epoch": 0.31261, + "grad_norm": 0.9493108584886132, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 31261 + }, + { + "epoch": 0.31262, + "grad_norm": 1.320170341017054, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 31262 + }, + { + "epoch": 0.31263, + "grad_norm": 0.8257790432990456, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 31263 + }, + { + "epoch": 0.31264, + "grad_norm": 0.9215070394140378, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 31264 + }, + { + "epoch": 0.31265, + "grad_norm": 0.9617440958599551, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 31265 + }, + { + "epoch": 0.31266, + "grad_norm": 0.9612800516884812, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 31266 + }, + { + "epoch": 0.31267, + "grad_norm": 0.8592863979003663, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 31267 + }, + { + "epoch": 0.31268, + "grad_norm": 0.7474252747736044, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 31268 + }, + { + "epoch": 0.31269, + "grad_norm": 0.6926686736824768, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 31269 + }, + { + "epoch": 0.3127, + "grad_norm": 0.6648553849500414, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 31270 + }, + { + "epoch": 0.31271, + "grad_norm": 0.6503654348039692, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 31271 + }, + { + "epoch": 0.31272, + "grad_norm": 0.6955850998442819, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 31272 + }, + { + "epoch": 0.31273, + "grad_norm": 0.9234904276314626, + "learning_rate": 0.003, + "loss": 4.052, + "step": 31273 + }, + { + "epoch": 0.31274, + "grad_norm": 1.423790160547063, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 31274 + }, + { + "epoch": 0.31275, + "grad_norm": 0.5805242330291865, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 31275 + }, + { + "epoch": 0.31276, + "grad_norm": 0.7735977638447247, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 31276 + }, + { + "epoch": 0.31277, + "grad_norm": 0.9734044611289877, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 31277 + }, + { + "epoch": 0.31278, + "grad_norm": 0.9854708059823793, + "learning_rate": 0.003, + "loss": 4.043, + "step": 31278 + }, + { + "epoch": 0.31279, + "grad_norm": 0.951508274795993, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 31279 + }, + { + "epoch": 0.3128, + "grad_norm": 0.7998936112835342, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 31280 + }, + { + "epoch": 0.31281, + "grad_norm": 0.6284474638940004, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 31281 + }, + { + "epoch": 0.31282, + "grad_norm": 0.576370437160648, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 31282 + }, + { + "epoch": 0.31283, + "grad_norm": 0.4850312343434524, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 31283 + }, + { + "epoch": 0.31284, + "grad_norm": 0.5172173956555511, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 31284 + }, + { + "epoch": 0.31285, + "grad_norm": 0.5685522769723065, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 31285 + }, + { + "epoch": 0.31286, + "grad_norm": 0.5742093413699484, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 31286 + }, + { + "epoch": 0.31287, + "grad_norm": 0.6084592366537805, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 31287 + }, + { + "epoch": 0.31288, + "grad_norm": 0.7479579936070707, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 31288 + }, + { + "epoch": 0.31289, + "grad_norm": 1.1496538538413394, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 31289 + }, + { + "epoch": 0.3129, + "grad_norm": 1.0361776798490294, + "learning_rate": 0.003, + "loss": 4.049, + "step": 31290 + }, + { + "epoch": 0.31291, + "grad_norm": 0.8864992519658409, + "learning_rate": 0.003, + "loss": 4.0066, + "step": 31291 + }, + { + "epoch": 0.31292, + "grad_norm": 0.9252412217776611, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 31292 + }, + { + "epoch": 0.31293, + "grad_norm": 0.9590487822230165, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 31293 + }, + { + "epoch": 0.31294, + "grad_norm": 1.057522904484121, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 31294 + }, + { + "epoch": 0.31295, + "grad_norm": 0.9740089651848963, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 31295 + }, + { + "epoch": 0.31296, + "grad_norm": 1.0988298714726008, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 31296 + }, + { + "epoch": 0.31297, + "grad_norm": 0.968915533076145, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 31297 + }, + { + "epoch": 0.31298, + "grad_norm": 1.0196908600993277, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 31298 + }, + { + "epoch": 0.31299, + "grad_norm": 1.0330180172036938, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 31299 + }, + { + "epoch": 0.313, + "grad_norm": 1.189443203185933, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 31300 + }, + { + "epoch": 0.31301, + "grad_norm": 0.7945486445307969, + "learning_rate": 0.003, + "loss": 4.049, + "step": 31301 + }, + { + "epoch": 0.31302, + "grad_norm": 0.775141095416031, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 31302 + }, + { + "epoch": 0.31303, + "grad_norm": 0.7344316537050568, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 31303 + }, + { + "epoch": 0.31304, + "grad_norm": 0.6895561453083252, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 31304 + }, + { + "epoch": 0.31305, + "grad_norm": 0.6137799858184082, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 31305 + }, + { + "epoch": 0.31306, + "grad_norm": 0.5876729994086001, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 31306 + }, + { + "epoch": 0.31307, + "grad_norm": 0.6486700873521241, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 31307 + }, + { + "epoch": 0.31308, + "grad_norm": 0.8280697092738462, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 31308 + }, + { + "epoch": 0.31309, + "grad_norm": 1.0774758805619744, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 31309 + }, + { + "epoch": 0.3131, + "grad_norm": 0.9639864570939809, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 31310 + }, + { + "epoch": 0.31311, + "grad_norm": 0.9502332172816403, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 31311 + }, + { + "epoch": 0.31312, + "grad_norm": 0.8051434649452919, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 31312 + }, + { + "epoch": 0.31313, + "grad_norm": 0.7693556210282144, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 31313 + }, + { + "epoch": 0.31314, + "grad_norm": 0.7739304624622881, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 31314 + }, + { + "epoch": 0.31315, + "grad_norm": 0.7606170307872977, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 31315 + }, + { + "epoch": 0.31316, + "grad_norm": 0.6844505762800396, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 31316 + }, + { + "epoch": 0.31317, + "grad_norm": 0.6130399972916988, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 31317 + }, + { + "epoch": 0.31318, + "grad_norm": 0.7236328680917914, + "learning_rate": 0.003, + "loss": 4.04, + "step": 31318 + }, + { + "epoch": 0.31319, + "grad_norm": 0.7993100556949229, + "learning_rate": 0.003, + "loss": 4.035, + "step": 31319 + }, + { + "epoch": 0.3132, + "grad_norm": 0.8802625181505642, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 31320 + }, + { + "epoch": 0.31321, + "grad_norm": 1.0427957942283264, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 31321 + }, + { + "epoch": 0.31322, + "grad_norm": 1.0347633189559249, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 31322 + }, + { + "epoch": 0.31323, + "grad_norm": 0.9435485380390768, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 31323 + }, + { + "epoch": 0.31324, + "grad_norm": 0.8526402921642833, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 31324 + }, + { + "epoch": 0.31325, + "grad_norm": 0.754478319854867, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 31325 + }, + { + "epoch": 0.31326, + "grad_norm": 0.7605091076587739, + "learning_rate": 0.003, + "loss": 4.026, + "step": 31326 + }, + { + "epoch": 0.31327, + "grad_norm": 0.7794250897314938, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 31327 + }, + { + "epoch": 0.31328, + "grad_norm": 0.7747243589301236, + "learning_rate": 0.003, + "loss": 4.014, + "step": 31328 + }, + { + "epoch": 0.31329, + "grad_norm": 0.7159520310027352, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 31329 + }, + { + "epoch": 0.3133, + "grad_norm": 0.7498574333971377, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 31330 + }, + { + "epoch": 0.31331, + "grad_norm": 1.052734786898398, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 31331 + }, + { + "epoch": 0.31332, + "grad_norm": 1.2403455237449617, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 31332 + }, + { + "epoch": 0.31333, + "grad_norm": 0.7784936556918506, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 31333 + }, + { + "epoch": 0.31334, + "grad_norm": 0.7403412533370851, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 31334 + }, + { + "epoch": 0.31335, + "grad_norm": 0.6015598539407172, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 31335 + }, + { + "epoch": 0.31336, + "grad_norm": 0.6002501411964667, + "learning_rate": 0.003, + "loss": 3.9818, + "step": 31336 + }, + { + "epoch": 0.31337, + "grad_norm": 0.5394909110768534, + "learning_rate": 0.003, + "loss": 4.023, + "step": 31337 + }, + { + "epoch": 0.31338, + "grad_norm": 0.6333523676523857, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 31338 + }, + { + "epoch": 0.31339, + "grad_norm": 0.6723572323852043, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 31339 + }, + { + "epoch": 0.3134, + "grad_norm": 0.7907891248600707, + "learning_rate": 0.003, + "loss": 4.025, + "step": 31340 + }, + { + "epoch": 0.31341, + "grad_norm": 0.9621131493802776, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 31341 + }, + { + "epoch": 0.31342, + "grad_norm": 1.2218285291185964, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 31342 + }, + { + "epoch": 0.31343, + "grad_norm": 0.6238641310899566, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 31343 + }, + { + "epoch": 0.31344, + "grad_norm": 0.7013203843762227, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 31344 + }, + { + "epoch": 0.31345, + "grad_norm": 0.9347271466660745, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 31345 + }, + { + "epoch": 0.31346, + "grad_norm": 0.962401973270663, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 31346 + }, + { + "epoch": 0.31347, + "grad_norm": 1.0224042412482492, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 31347 + }, + { + "epoch": 0.31348, + "grad_norm": 0.8483883267665707, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 31348 + }, + { + "epoch": 0.31349, + "grad_norm": 0.9609667244956539, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 31349 + }, + { + "epoch": 0.3135, + "grad_norm": 1.0923026456566096, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 31350 + }, + { + "epoch": 0.31351, + "grad_norm": 0.9497211163935165, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 31351 + }, + { + "epoch": 0.31352, + "grad_norm": 0.8982547828393189, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 31352 + }, + { + "epoch": 0.31353, + "grad_norm": 0.9820001779664211, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 31353 + }, + { + "epoch": 0.31354, + "grad_norm": 1.1332154795043357, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 31354 + }, + { + "epoch": 0.31355, + "grad_norm": 0.8913804617812923, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 31355 + }, + { + "epoch": 0.31356, + "grad_norm": 0.8221166609024061, + "learning_rate": 0.003, + "loss": 4.074, + "step": 31356 + }, + { + "epoch": 0.31357, + "grad_norm": 0.753767145691851, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 31357 + }, + { + "epoch": 0.31358, + "grad_norm": 0.79083961407806, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 31358 + }, + { + "epoch": 0.31359, + "grad_norm": 0.8455862674734812, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 31359 + }, + { + "epoch": 0.3136, + "grad_norm": 0.9338287553085705, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 31360 + }, + { + "epoch": 0.31361, + "grad_norm": 0.9725815363711021, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 31361 + }, + { + "epoch": 0.31362, + "grad_norm": 0.914073242595534, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 31362 + }, + { + "epoch": 0.31363, + "grad_norm": 0.9431780368673159, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 31363 + }, + { + "epoch": 0.31364, + "grad_norm": 1.0955915714574764, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 31364 + }, + { + "epoch": 0.31365, + "grad_norm": 1.1354548314149375, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 31365 + }, + { + "epoch": 0.31366, + "grad_norm": 1.134973051382539, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 31366 + }, + { + "epoch": 0.31367, + "grad_norm": 0.913351339990013, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 31367 + }, + { + "epoch": 0.31368, + "grad_norm": 1.0151679853784419, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 31368 + }, + { + "epoch": 0.31369, + "grad_norm": 1.0273397406669742, + "learning_rate": 0.003, + "loss": 4.034, + "step": 31369 + }, + { + "epoch": 0.3137, + "grad_norm": 0.9063926345709336, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 31370 + }, + { + "epoch": 0.31371, + "grad_norm": 0.8502527095425136, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 31371 + }, + { + "epoch": 0.31372, + "grad_norm": 0.7350002874353218, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 31372 + }, + { + "epoch": 0.31373, + "grad_norm": 0.7174568885419096, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 31373 + }, + { + "epoch": 0.31374, + "grad_norm": 0.6706716131851381, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 31374 + }, + { + "epoch": 0.31375, + "grad_norm": 0.6521920639968554, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 31375 + }, + { + "epoch": 0.31376, + "grad_norm": 0.7508854223153066, + "learning_rate": 0.003, + "loss": 4.048, + "step": 31376 + }, + { + "epoch": 0.31377, + "grad_norm": 0.7877383284749523, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 31377 + }, + { + "epoch": 0.31378, + "grad_norm": 0.7892395256029134, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 31378 + }, + { + "epoch": 0.31379, + "grad_norm": 0.9024115584641496, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 31379 + }, + { + "epoch": 0.3138, + "grad_norm": 1.171589472978926, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 31380 + }, + { + "epoch": 0.31381, + "grad_norm": 0.8466237237895282, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 31381 + }, + { + "epoch": 0.31382, + "grad_norm": 0.853061371886525, + "learning_rate": 0.003, + "loss": 4.023, + "step": 31382 + }, + { + "epoch": 0.31383, + "grad_norm": 0.8244918145519741, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 31383 + }, + { + "epoch": 0.31384, + "grad_norm": 0.7181811248919989, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 31384 + }, + { + "epoch": 0.31385, + "grad_norm": 0.7575242694894838, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 31385 + }, + { + "epoch": 0.31386, + "grad_norm": 0.7421622516906904, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 31386 + }, + { + "epoch": 0.31387, + "grad_norm": 0.7192331649009231, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 31387 + }, + { + "epoch": 0.31388, + "grad_norm": 0.7294987276373842, + "learning_rate": 0.003, + "loss": 4.05, + "step": 31388 + }, + { + "epoch": 0.31389, + "grad_norm": 0.813634071889361, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 31389 + }, + { + "epoch": 0.3139, + "grad_norm": 0.9210888950879994, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 31390 + }, + { + "epoch": 0.31391, + "grad_norm": 1.009942818579309, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 31391 + }, + { + "epoch": 0.31392, + "grad_norm": 0.9483903261278875, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 31392 + }, + { + "epoch": 0.31393, + "grad_norm": 0.8298731427920731, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 31393 + }, + { + "epoch": 0.31394, + "grad_norm": 0.6811203514394315, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 31394 + }, + { + "epoch": 0.31395, + "grad_norm": 0.5603691076603762, + "learning_rate": 0.003, + "loss": 4.017, + "step": 31395 + }, + { + "epoch": 0.31396, + "grad_norm": 0.521281192037141, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 31396 + }, + { + "epoch": 0.31397, + "grad_norm": 0.5571842269041183, + "learning_rate": 0.003, + "loss": 4.0029, + "step": 31397 + }, + { + "epoch": 0.31398, + "grad_norm": 0.6374310179287845, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 31398 + }, + { + "epoch": 0.31399, + "grad_norm": 0.8125331790677132, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 31399 + }, + { + "epoch": 0.314, + "grad_norm": 0.9703823346408125, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 31400 + }, + { + "epoch": 0.31401, + "grad_norm": 1.0501379222586775, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 31401 + }, + { + "epoch": 0.31402, + "grad_norm": 0.9714619607347084, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 31402 + }, + { + "epoch": 0.31403, + "grad_norm": 0.9256726775508305, + "learning_rate": 0.003, + "loss": 4.025, + "step": 31403 + }, + { + "epoch": 0.31404, + "grad_norm": 0.795905295187092, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 31404 + }, + { + "epoch": 0.31405, + "grad_norm": 0.89964271728939, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 31405 + }, + { + "epoch": 0.31406, + "grad_norm": 0.9999650074767067, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 31406 + }, + { + "epoch": 0.31407, + "grad_norm": 0.8732106738754914, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 31407 + }, + { + "epoch": 0.31408, + "grad_norm": 0.847237204840054, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 31408 + }, + { + "epoch": 0.31409, + "grad_norm": 0.8503949654555671, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 31409 + }, + { + "epoch": 0.3141, + "grad_norm": 0.8297222005689434, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 31410 + }, + { + "epoch": 0.31411, + "grad_norm": 0.8473503787406031, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 31411 + }, + { + "epoch": 0.31412, + "grad_norm": 0.8836119325416425, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 31412 + }, + { + "epoch": 0.31413, + "grad_norm": 0.9517630284095333, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 31413 + }, + { + "epoch": 0.31414, + "grad_norm": 1.0168015190097535, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 31414 + }, + { + "epoch": 0.31415, + "grad_norm": 0.9835037996817881, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 31415 + }, + { + "epoch": 0.31416, + "grad_norm": 1.0423382382293924, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 31416 + }, + { + "epoch": 0.31417, + "grad_norm": 0.897022464956856, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 31417 + }, + { + "epoch": 0.31418, + "grad_norm": 0.8391750410098842, + "learning_rate": 0.003, + "loss": 4.042, + "step": 31418 + }, + { + "epoch": 0.31419, + "grad_norm": 0.8343248748529174, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 31419 + }, + { + "epoch": 0.3142, + "grad_norm": 0.7319467478558885, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 31420 + }, + { + "epoch": 0.31421, + "grad_norm": 0.7183003951678939, + "learning_rate": 0.003, + "loss": 4.041, + "step": 31421 + }, + { + "epoch": 0.31422, + "grad_norm": 0.797679549076921, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 31422 + }, + { + "epoch": 0.31423, + "grad_norm": 0.8881840391071272, + "learning_rate": 0.003, + "loss": 4.064, + "step": 31423 + }, + { + "epoch": 0.31424, + "grad_norm": 1.0936482438892645, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 31424 + }, + { + "epoch": 0.31425, + "grad_norm": 1.0100483402725782, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 31425 + }, + { + "epoch": 0.31426, + "grad_norm": 1.0107734770963108, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 31426 + }, + { + "epoch": 0.31427, + "grad_norm": 0.9211092698513981, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 31427 + }, + { + "epoch": 0.31428, + "grad_norm": 0.9601652131337872, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 31428 + }, + { + "epoch": 0.31429, + "grad_norm": 1.0346837898253092, + "learning_rate": 0.003, + "loss": 4.053, + "step": 31429 + }, + { + "epoch": 0.3143, + "grad_norm": 0.9513651162331734, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 31430 + }, + { + "epoch": 0.31431, + "grad_norm": 0.9944376760722243, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 31431 + }, + { + "epoch": 0.31432, + "grad_norm": 0.894472493022726, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 31432 + }, + { + "epoch": 0.31433, + "grad_norm": 0.7707950061818188, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 31433 + }, + { + "epoch": 0.31434, + "grad_norm": 0.683570533087187, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 31434 + }, + { + "epoch": 0.31435, + "grad_norm": 0.643016121055002, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 31435 + }, + { + "epoch": 0.31436, + "grad_norm": 0.6764699271120648, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 31436 + }, + { + "epoch": 0.31437, + "grad_norm": 0.7016730352478527, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 31437 + }, + { + "epoch": 0.31438, + "grad_norm": 0.7445921125786251, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 31438 + }, + { + "epoch": 0.31439, + "grad_norm": 0.7367089619603503, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 31439 + }, + { + "epoch": 0.3144, + "grad_norm": 0.7968192598453699, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 31440 + }, + { + "epoch": 0.31441, + "grad_norm": 0.9196416964761496, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 31441 + }, + { + "epoch": 0.31442, + "grad_norm": 0.8869221902427373, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 31442 + }, + { + "epoch": 0.31443, + "grad_norm": 0.95563549757164, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 31443 + }, + { + "epoch": 0.31444, + "grad_norm": 1.1535323369859023, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 31444 + }, + { + "epoch": 0.31445, + "grad_norm": 0.7788533204274352, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 31445 + }, + { + "epoch": 0.31446, + "grad_norm": 0.7646667621677142, + "learning_rate": 0.003, + "loss": 4.047, + "step": 31446 + }, + { + "epoch": 0.31447, + "grad_norm": 0.7268424360155621, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 31447 + }, + { + "epoch": 0.31448, + "grad_norm": 0.7051990406057544, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 31448 + }, + { + "epoch": 0.31449, + "grad_norm": 0.7667178976639589, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 31449 + }, + { + "epoch": 0.3145, + "grad_norm": 0.8811877091248324, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 31450 + }, + { + "epoch": 0.31451, + "grad_norm": 1.0606932796869641, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 31451 + }, + { + "epoch": 0.31452, + "grad_norm": 1.138061525280638, + "learning_rate": 0.003, + "loss": 4.068, + "step": 31452 + }, + { + "epoch": 0.31453, + "grad_norm": 0.8392617322165471, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 31453 + }, + { + "epoch": 0.31454, + "grad_norm": 0.7315933226870922, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 31454 + }, + { + "epoch": 0.31455, + "grad_norm": 0.7784931612914113, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 31455 + }, + { + "epoch": 0.31456, + "grad_norm": 0.8374383070017554, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 31456 + }, + { + "epoch": 0.31457, + "grad_norm": 0.7732751663438211, + "learning_rate": 0.003, + "loss": 4.035, + "step": 31457 + }, + { + "epoch": 0.31458, + "grad_norm": 0.8336727542963562, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 31458 + }, + { + "epoch": 0.31459, + "grad_norm": 0.6695288860098132, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 31459 + }, + { + "epoch": 0.3146, + "grad_norm": 0.586430971045819, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 31460 + }, + { + "epoch": 0.31461, + "grad_norm": 0.5760345328379618, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 31461 + }, + { + "epoch": 0.31462, + "grad_norm": 0.5948458999678835, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 31462 + }, + { + "epoch": 0.31463, + "grad_norm": 0.6832035380620919, + "learning_rate": 0.003, + "loss": 4.0025, + "step": 31463 + }, + { + "epoch": 0.31464, + "grad_norm": 0.7349941852892047, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 31464 + }, + { + "epoch": 0.31465, + "grad_norm": 0.7016872208467747, + "learning_rate": 0.003, + "loss": 4.009, + "step": 31465 + }, + { + "epoch": 0.31466, + "grad_norm": 0.8373583991049042, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 31466 + }, + { + "epoch": 0.31467, + "grad_norm": 0.9971947763460353, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 31467 + }, + { + "epoch": 0.31468, + "grad_norm": 1.154593159726765, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 31468 + }, + { + "epoch": 0.31469, + "grad_norm": 1.1229316779199499, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 31469 + }, + { + "epoch": 0.3147, + "grad_norm": 1.004531307478109, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 31470 + }, + { + "epoch": 0.31471, + "grad_norm": 0.9982594618187117, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 31471 + }, + { + "epoch": 0.31472, + "grad_norm": 1.007724904172985, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 31472 + }, + { + "epoch": 0.31473, + "grad_norm": 0.9706938559902623, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 31473 + }, + { + "epoch": 0.31474, + "grad_norm": 0.9560595090167119, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 31474 + }, + { + "epoch": 0.31475, + "grad_norm": 0.9221668082250363, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 31475 + }, + { + "epoch": 0.31476, + "grad_norm": 0.9926479435376707, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 31476 + }, + { + "epoch": 0.31477, + "grad_norm": 0.872413118652345, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 31477 + }, + { + "epoch": 0.31478, + "grad_norm": 0.8372374862555823, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 31478 + }, + { + "epoch": 0.31479, + "grad_norm": 0.8425253151980219, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 31479 + }, + { + "epoch": 0.3148, + "grad_norm": 0.8929528330898638, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 31480 + }, + { + "epoch": 0.31481, + "grad_norm": 1.0401647240304162, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 31481 + }, + { + "epoch": 0.31482, + "grad_norm": 1.0840534674223608, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 31482 + }, + { + "epoch": 0.31483, + "grad_norm": 0.7192967506448255, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 31483 + }, + { + "epoch": 0.31484, + "grad_norm": 0.7798674481374122, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 31484 + }, + { + "epoch": 0.31485, + "grad_norm": 0.8925212821814577, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 31485 + }, + { + "epoch": 0.31486, + "grad_norm": 1.138140324830462, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 31486 + }, + { + "epoch": 0.31487, + "grad_norm": 1.0242470426103574, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 31487 + }, + { + "epoch": 0.31488, + "grad_norm": 0.8917224795240775, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 31488 + }, + { + "epoch": 0.31489, + "grad_norm": 0.7611276213842967, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 31489 + }, + { + "epoch": 0.3149, + "grad_norm": 0.684214619592837, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 31490 + }, + { + "epoch": 0.31491, + "grad_norm": 0.6561065239291558, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 31491 + }, + { + "epoch": 0.31492, + "grad_norm": 0.6690341569632381, + "learning_rate": 0.003, + "loss": 4.045, + "step": 31492 + }, + { + "epoch": 0.31493, + "grad_norm": 0.6222014100675942, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 31493 + }, + { + "epoch": 0.31494, + "grad_norm": 0.5653210687020029, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 31494 + }, + { + "epoch": 0.31495, + "grad_norm": 0.5630389222470825, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 31495 + }, + { + "epoch": 0.31496, + "grad_norm": 0.5825271161347437, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 31496 + }, + { + "epoch": 0.31497, + "grad_norm": 0.6419650109019417, + "learning_rate": 0.003, + "loss": 4.047, + "step": 31497 + }, + { + "epoch": 0.31498, + "grad_norm": 0.8426120560624978, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 31498 + }, + { + "epoch": 0.31499, + "grad_norm": 1.0765552105008858, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 31499 + }, + { + "epoch": 0.315, + "grad_norm": 1.0981563508760492, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 31500 + }, + { + "epoch": 0.31501, + "grad_norm": 0.8168940954774574, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 31501 + }, + { + "epoch": 0.31502, + "grad_norm": 0.6916149314608744, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 31502 + }, + { + "epoch": 0.31503, + "grad_norm": 0.6821972870452591, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 31503 + }, + { + "epoch": 0.31504, + "grad_norm": 0.6927482358310412, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 31504 + }, + { + "epoch": 0.31505, + "grad_norm": 0.6185613111391783, + "learning_rate": 0.003, + "loss": 4.051, + "step": 31505 + }, + { + "epoch": 0.31506, + "grad_norm": 0.5968993551486497, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 31506 + }, + { + "epoch": 0.31507, + "grad_norm": 0.6079414972176316, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 31507 + }, + { + "epoch": 0.31508, + "grad_norm": 0.6260245227724134, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 31508 + }, + { + "epoch": 0.31509, + "grad_norm": 0.8187426271740212, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 31509 + }, + { + "epoch": 0.3151, + "grad_norm": 1.1473446948935555, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 31510 + }, + { + "epoch": 0.31511, + "grad_norm": 0.9626633620945743, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 31511 + }, + { + "epoch": 0.31512, + "grad_norm": 0.6863010312824587, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 31512 + }, + { + "epoch": 0.31513, + "grad_norm": 0.6480230887291982, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 31513 + }, + { + "epoch": 0.31514, + "grad_norm": 0.7782447658659731, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 31514 + }, + { + "epoch": 0.31515, + "grad_norm": 0.9154335692282332, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 31515 + }, + { + "epoch": 0.31516, + "grad_norm": 0.9776740823729358, + "learning_rate": 0.003, + "loss": 4.054, + "step": 31516 + }, + { + "epoch": 0.31517, + "grad_norm": 0.8324537101275667, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 31517 + }, + { + "epoch": 0.31518, + "grad_norm": 0.8101574605261932, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 31518 + }, + { + "epoch": 0.31519, + "grad_norm": 0.868121041458849, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 31519 + }, + { + "epoch": 0.3152, + "grad_norm": 0.9715844219118202, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 31520 + }, + { + "epoch": 0.31521, + "grad_norm": 0.8990374450057724, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 31521 + }, + { + "epoch": 0.31522, + "grad_norm": 1.0474237364989, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 31522 + }, + { + "epoch": 0.31523, + "grad_norm": 1.1190412437367427, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 31523 + }, + { + "epoch": 0.31524, + "grad_norm": 0.8823892293574602, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 31524 + }, + { + "epoch": 0.31525, + "grad_norm": 0.8257457363939992, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 31525 + }, + { + "epoch": 0.31526, + "grad_norm": 0.8759215257229253, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 31526 + }, + { + "epoch": 0.31527, + "grad_norm": 0.974700153371874, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 31527 + }, + { + "epoch": 0.31528, + "grad_norm": 1.0986262461602914, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 31528 + }, + { + "epoch": 0.31529, + "grad_norm": 0.7514007730805605, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 31529 + }, + { + "epoch": 0.3153, + "grad_norm": 0.7451664883802264, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 31530 + }, + { + "epoch": 0.31531, + "grad_norm": 0.7750083124644541, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 31531 + }, + { + "epoch": 0.31532, + "grad_norm": 0.8015763184808444, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 31532 + }, + { + "epoch": 0.31533, + "grad_norm": 0.8780765317590714, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 31533 + }, + { + "epoch": 0.31534, + "grad_norm": 1.0553633125108448, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 31534 + }, + { + "epoch": 0.31535, + "grad_norm": 1.1709473349607877, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 31535 + }, + { + "epoch": 0.31536, + "grad_norm": 0.9446207601406597, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 31536 + }, + { + "epoch": 0.31537, + "grad_norm": 0.8886430581539335, + "learning_rate": 0.003, + "loss": 4.021, + "step": 31537 + }, + { + "epoch": 0.31538, + "grad_norm": 0.8050184923219098, + "learning_rate": 0.003, + "loss": 4.022, + "step": 31538 + }, + { + "epoch": 0.31539, + "grad_norm": 0.8238695859243506, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 31539 + }, + { + "epoch": 0.3154, + "grad_norm": 0.8014267068429404, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 31540 + }, + { + "epoch": 0.31541, + "grad_norm": 0.8558140754638787, + "learning_rate": 0.003, + "loss": 4.056, + "step": 31541 + }, + { + "epoch": 0.31542, + "grad_norm": 0.8078201320536111, + "learning_rate": 0.003, + "loss": 3.9982, + "step": 31542 + }, + { + "epoch": 0.31543, + "grad_norm": 0.7317805602344942, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 31543 + }, + { + "epoch": 0.31544, + "grad_norm": 0.711560996956267, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 31544 + }, + { + "epoch": 0.31545, + "grad_norm": 0.8254289502078649, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 31545 + }, + { + "epoch": 0.31546, + "grad_norm": 0.8947727967870772, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 31546 + }, + { + "epoch": 0.31547, + "grad_norm": 0.9048624298124993, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 31547 + }, + { + "epoch": 0.31548, + "grad_norm": 0.9418557247184818, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 31548 + }, + { + "epoch": 0.31549, + "grad_norm": 0.920059445591814, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 31549 + }, + { + "epoch": 0.3155, + "grad_norm": 0.8567144024422263, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 31550 + }, + { + "epoch": 0.31551, + "grad_norm": 0.8506463099646814, + "learning_rate": 0.003, + "loss": 4.059, + "step": 31551 + }, + { + "epoch": 0.31552, + "grad_norm": 0.9609793592118786, + "learning_rate": 0.003, + "loss": 4.034, + "step": 31552 + }, + { + "epoch": 0.31553, + "grad_norm": 1.301669821468197, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 31553 + }, + { + "epoch": 0.31554, + "grad_norm": 0.8756560890625696, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 31554 + }, + { + "epoch": 0.31555, + "grad_norm": 0.7554566068095743, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 31555 + }, + { + "epoch": 0.31556, + "grad_norm": 0.7526543188687029, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 31556 + }, + { + "epoch": 0.31557, + "grad_norm": 0.8330966220631019, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 31557 + }, + { + "epoch": 0.31558, + "grad_norm": 0.9124007294069366, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 31558 + }, + { + "epoch": 0.31559, + "grad_norm": 0.9659274000695888, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 31559 + }, + { + "epoch": 0.3156, + "grad_norm": 0.9999265484823098, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 31560 + }, + { + "epoch": 0.31561, + "grad_norm": 0.9582478299715528, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 31561 + }, + { + "epoch": 0.31562, + "grad_norm": 0.9060478913375641, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 31562 + }, + { + "epoch": 0.31563, + "grad_norm": 0.8325520195295539, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 31563 + }, + { + "epoch": 0.31564, + "grad_norm": 0.7389213534346812, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 31564 + }, + { + "epoch": 0.31565, + "grad_norm": 0.7146758572090178, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 31565 + }, + { + "epoch": 0.31566, + "grad_norm": 0.747967947045861, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 31566 + }, + { + "epoch": 0.31567, + "grad_norm": 0.7005297479117018, + "learning_rate": 0.003, + "loss": 4.051, + "step": 31567 + }, + { + "epoch": 0.31568, + "grad_norm": 0.6254137617205997, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 31568 + }, + { + "epoch": 0.31569, + "grad_norm": 0.6590887087229161, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 31569 + }, + { + "epoch": 0.3157, + "grad_norm": 0.6481777506187365, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 31570 + }, + { + "epoch": 0.31571, + "grad_norm": 0.6754959180795943, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 31571 + }, + { + "epoch": 0.31572, + "grad_norm": 0.8033778993739903, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 31572 + }, + { + "epoch": 0.31573, + "grad_norm": 0.9251453559014015, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 31573 + }, + { + "epoch": 0.31574, + "grad_norm": 1.0766641364846352, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 31574 + }, + { + "epoch": 0.31575, + "grad_norm": 1.1876597270526545, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 31575 + }, + { + "epoch": 0.31576, + "grad_norm": 0.9450521358017108, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 31576 + }, + { + "epoch": 0.31577, + "grad_norm": 0.877600090883553, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 31577 + }, + { + "epoch": 0.31578, + "grad_norm": 0.8551764731496907, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 31578 + }, + { + "epoch": 0.31579, + "grad_norm": 0.8068423121888187, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 31579 + }, + { + "epoch": 0.3158, + "grad_norm": 0.8439504832750326, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 31580 + }, + { + "epoch": 0.31581, + "grad_norm": 0.9088579821771007, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 31581 + }, + { + "epoch": 0.31582, + "grad_norm": 1.0079748329243585, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 31582 + }, + { + "epoch": 0.31583, + "grad_norm": 1.0290257633094382, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 31583 + }, + { + "epoch": 0.31584, + "grad_norm": 0.8130658427301913, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 31584 + }, + { + "epoch": 0.31585, + "grad_norm": 0.6804106040802933, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 31585 + }, + { + "epoch": 0.31586, + "grad_norm": 0.6643786630886397, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 31586 + }, + { + "epoch": 0.31587, + "grad_norm": 0.65495435499212, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 31587 + }, + { + "epoch": 0.31588, + "grad_norm": 0.6342785492568463, + "learning_rate": 0.003, + "loss": 4.043, + "step": 31588 + }, + { + "epoch": 0.31589, + "grad_norm": 0.6313748263619188, + "learning_rate": 0.003, + "loss": 4.023, + "step": 31589 + }, + { + "epoch": 0.3159, + "grad_norm": 0.6787853360862045, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 31590 + }, + { + "epoch": 0.31591, + "grad_norm": 0.7657142999190774, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 31591 + }, + { + "epoch": 0.31592, + "grad_norm": 1.0232697747908814, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 31592 + }, + { + "epoch": 0.31593, + "grad_norm": 1.2400184879033547, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 31593 + }, + { + "epoch": 0.31594, + "grad_norm": 0.929521238600925, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 31594 + }, + { + "epoch": 0.31595, + "grad_norm": 0.8651041277415614, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 31595 + }, + { + "epoch": 0.31596, + "grad_norm": 0.7952548300946874, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 31596 + }, + { + "epoch": 0.31597, + "grad_norm": 0.7351706087981932, + "learning_rate": 0.003, + "loss": 4.05, + "step": 31597 + }, + { + "epoch": 0.31598, + "grad_norm": 0.7099837993513373, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 31598 + }, + { + "epoch": 0.31599, + "grad_norm": 0.6841804804460871, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 31599 + }, + { + "epoch": 0.316, + "grad_norm": 0.7081385041857993, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 31600 + }, + { + "epoch": 0.31601, + "grad_norm": 0.9570912725399182, + "learning_rate": 0.003, + "loss": 4.031, + "step": 31601 + }, + { + "epoch": 0.31602, + "grad_norm": 1.240250894516626, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 31602 + }, + { + "epoch": 0.31603, + "grad_norm": 0.8134500309928108, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 31603 + }, + { + "epoch": 0.31604, + "grad_norm": 0.6839302874059167, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 31604 + }, + { + "epoch": 0.31605, + "grad_norm": 0.7260660295926408, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 31605 + }, + { + "epoch": 0.31606, + "grad_norm": 0.7245199102326637, + "learning_rate": 0.003, + "loss": 3.9995, + "step": 31606 + }, + { + "epoch": 0.31607, + "grad_norm": 0.7668130856449261, + "learning_rate": 0.003, + "loss": 4.033, + "step": 31607 + }, + { + "epoch": 0.31608, + "grad_norm": 0.7958127175057887, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 31608 + }, + { + "epoch": 0.31609, + "grad_norm": 0.834865414299694, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 31609 + }, + { + "epoch": 0.3161, + "grad_norm": 0.7847391831492195, + "learning_rate": 0.003, + "loss": 4.036, + "step": 31610 + }, + { + "epoch": 0.31611, + "grad_norm": 0.704113949103065, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 31611 + }, + { + "epoch": 0.31612, + "grad_norm": 0.8075935574503997, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 31612 + }, + { + "epoch": 0.31613, + "grad_norm": 1.0637410204692654, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 31613 + }, + { + "epoch": 0.31614, + "grad_norm": 1.2567720941913594, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 31614 + }, + { + "epoch": 0.31615, + "grad_norm": 0.8725355820747278, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 31615 + }, + { + "epoch": 0.31616, + "grad_norm": 0.8549274490791835, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 31616 + }, + { + "epoch": 0.31617, + "grad_norm": 0.9214495021492208, + "learning_rate": 0.003, + "loss": 4.056, + "step": 31617 + }, + { + "epoch": 0.31618, + "grad_norm": 1.0149862847536824, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 31618 + }, + { + "epoch": 0.31619, + "grad_norm": 0.907353275277867, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 31619 + }, + { + "epoch": 0.3162, + "grad_norm": 0.904757399125384, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 31620 + }, + { + "epoch": 0.31621, + "grad_norm": 0.9865243249951052, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 31621 + }, + { + "epoch": 0.31622, + "grad_norm": 0.9569986916542709, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 31622 + }, + { + "epoch": 0.31623, + "grad_norm": 0.9371445823565684, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 31623 + }, + { + "epoch": 0.31624, + "grad_norm": 0.9607636681326829, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 31624 + }, + { + "epoch": 0.31625, + "grad_norm": 0.9508530903650089, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 31625 + }, + { + "epoch": 0.31626, + "grad_norm": 0.8889351590108432, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 31626 + }, + { + "epoch": 0.31627, + "grad_norm": 0.7349610638343067, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 31627 + }, + { + "epoch": 0.31628, + "grad_norm": 0.6856110587766746, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 31628 + }, + { + "epoch": 0.31629, + "grad_norm": 0.6822003746599821, + "learning_rate": 0.003, + "loss": 4.027, + "step": 31629 + }, + { + "epoch": 0.3163, + "grad_norm": 0.691430913414624, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 31630 + }, + { + "epoch": 0.31631, + "grad_norm": 0.8317875291135071, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 31631 + }, + { + "epoch": 0.31632, + "grad_norm": 1.0335665785727393, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 31632 + }, + { + "epoch": 0.31633, + "grad_norm": 1.351428069233235, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 31633 + }, + { + "epoch": 0.31634, + "grad_norm": 0.6805819769020331, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 31634 + }, + { + "epoch": 0.31635, + "grad_norm": 0.6294144043689914, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 31635 + }, + { + "epoch": 0.31636, + "grad_norm": 0.7170792948388789, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 31636 + }, + { + "epoch": 0.31637, + "grad_norm": 0.6110946813975363, + "learning_rate": 0.003, + "loss": 4.031, + "step": 31637 + }, + { + "epoch": 0.31638, + "grad_norm": 0.5552248375990422, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 31638 + }, + { + "epoch": 0.31639, + "grad_norm": 0.5417346708688879, + "learning_rate": 0.003, + "loss": 3.997, + "step": 31639 + }, + { + "epoch": 0.3164, + "grad_norm": 0.5981597260356412, + "learning_rate": 0.003, + "loss": 4.0101, + "step": 31640 + }, + { + "epoch": 0.31641, + "grad_norm": 0.6544517110030807, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 31641 + }, + { + "epoch": 0.31642, + "grad_norm": 0.6904413697582259, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 31642 + }, + { + "epoch": 0.31643, + "grad_norm": 0.8899234276497832, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 31643 + }, + { + "epoch": 0.31644, + "grad_norm": 1.2369451691829019, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 31644 + }, + { + "epoch": 0.31645, + "grad_norm": 0.7487791474959529, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 31645 + }, + { + "epoch": 0.31646, + "grad_norm": 0.6083826103279368, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 31646 + }, + { + "epoch": 0.31647, + "grad_norm": 0.64825076375314, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 31647 + }, + { + "epoch": 0.31648, + "grad_norm": 0.6822954676765187, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 31648 + }, + { + "epoch": 0.31649, + "grad_norm": 0.784412459404125, + "learning_rate": 0.003, + "loss": 4.015, + "step": 31649 + }, + { + "epoch": 0.3165, + "grad_norm": 0.9886739316691603, + "learning_rate": 0.003, + "loss": 4.018, + "step": 31650 + }, + { + "epoch": 0.31651, + "grad_norm": 1.3003920899010992, + "learning_rate": 0.003, + "loss": 4.057, + "step": 31651 + }, + { + "epoch": 0.31652, + "grad_norm": 0.6703372930904052, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 31652 + }, + { + "epoch": 0.31653, + "grad_norm": 0.6941063937268447, + "learning_rate": 0.003, + "loss": 4.017, + "step": 31653 + }, + { + "epoch": 0.31654, + "grad_norm": 0.6938753940432911, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 31654 + }, + { + "epoch": 0.31655, + "grad_norm": 0.6649660841946136, + "learning_rate": 0.003, + "loss": 3.9934, + "step": 31655 + }, + { + "epoch": 0.31656, + "grad_norm": 0.6470272695669527, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 31656 + }, + { + "epoch": 0.31657, + "grad_norm": 0.6891802057526383, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 31657 + }, + { + "epoch": 0.31658, + "grad_norm": 0.7514515597838847, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 31658 + }, + { + "epoch": 0.31659, + "grad_norm": 0.8235411986104749, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 31659 + }, + { + "epoch": 0.3166, + "grad_norm": 0.8367570328730097, + "learning_rate": 0.003, + "loss": 3.998, + "step": 31660 + }, + { + "epoch": 0.31661, + "grad_norm": 0.7828977411965163, + "learning_rate": 0.003, + "loss": 4.015, + "step": 31661 + }, + { + "epoch": 0.31662, + "grad_norm": 0.8224378610466981, + "learning_rate": 0.003, + "loss": 4.026, + "step": 31662 + }, + { + "epoch": 0.31663, + "grad_norm": 0.9459133092552918, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 31663 + }, + { + "epoch": 0.31664, + "grad_norm": 1.1968785427206308, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 31664 + }, + { + "epoch": 0.31665, + "grad_norm": 1.004923488105499, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 31665 + }, + { + "epoch": 0.31666, + "grad_norm": 1.034715300979389, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 31666 + }, + { + "epoch": 0.31667, + "grad_norm": 1.1687606828565789, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 31667 + }, + { + "epoch": 0.31668, + "grad_norm": 0.951580207007303, + "learning_rate": 0.003, + "loss": 4.044, + "step": 31668 + }, + { + "epoch": 0.31669, + "grad_norm": 0.8574852377520135, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 31669 + }, + { + "epoch": 0.3167, + "grad_norm": 0.8214861522461784, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 31670 + }, + { + "epoch": 0.31671, + "grad_norm": 0.8266482803256938, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 31671 + }, + { + "epoch": 0.31672, + "grad_norm": 0.8304211857262456, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 31672 + }, + { + "epoch": 0.31673, + "grad_norm": 0.848662824027803, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 31673 + }, + { + "epoch": 0.31674, + "grad_norm": 0.8509902943356481, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 31674 + }, + { + "epoch": 0.31675, + "grad_norm": 0.9661900118811518, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 31675 + }, + { + "epoch": 0.31676, + "grad_norm": 1.1948854276986531, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 31676 + }, + { + "epoch": 0.31677, + "grad_norm": 0.9194235524734188, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 31677 + }, + { + "epoch": 0.31678, + "grad_norm": 0.9207504793325362, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 31678 + }, + { + "epoch": 0.31679, + "grad_norm": 0.9561254133682046, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 31679 + }, + { + "epoch": 0.3168, + "grad_norm": 1.1280401166332852, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 31680 + }, + { + "epoch": 0.31681, + "grad_norm": 0.950069108278576, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 31681 + }, + { + "epoch": 0.31682, + "grad_norm": 0.8097316415915052, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 31682 + }, + { + "epoch": 0.31683, + "grad_norm": 0.8423696212864596, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 31683 + }, + { + "epoch": 0.31684, + "grad_norm": 0.8574379662117941, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 31684 + }, + { + "epoch": 0.31685, + "grad_norm": 0.9072313518922669, + "learning_rate": 0.003, + "loss": 4.054, + "step": 31685 + }, + { + "epoch": 0.31686, + "grad_norm": 0.8724464981229997, + "learning_rate": 0.003, + "loss": 4.061, + "step": 31686 + }, + { + "epoch": 0.31687, + "grad_norm": 0.7467563046873948, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 31687 + }, + { + "epoch": 0.31688, + "grad_norm": 0.7994163277917888, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 31688 + }, + { + "epoch": 0.31689, + "grad_norm": 0.857007422215465, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 31689 + }, + { + "epoch": 0.3169, + "grad_norm": 0.9664597742598159, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 31690 + }, + { + "epoch": 0.31691, + "grad_norm": 1.0617823139984794, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 31691 + }, + { + "epoch": 0.31692, + "grad_norm": 1.0896529161154136, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 31692 + }, + { + "epoch": 0.31693, + "grad_norm": 0.8921868647076213, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 31693 + }, + { + "epoch": 0.31694, + "grad_norm": 0.8021453358951197, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 31694 + }, + { + "epoch": 0.31695, + "grad_norm": 0.6973597093958827, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 31695 + }, + { + "epoch": 0.31696, + "grad_norm": 0.6969098857510676, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 31696 + }, + { + "epoch": 0.31697, + "grad_norm": 0.6990828159371222, + "learning_rate": 0.003, + "loss": 4.013, + "step": 31697 + }, + { + "epoch": 0.31698, + "grad_norm": 0.7132234232793425, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 31698 + }, + { + "epoch": 0.31699, + "grad_norm": 0.7674043000928494, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 31699 + }, + { + "epoch": 0.317, + "grad_norm": 0.8127487960959816, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 31700 + }, + { + "epoch": 0.31701, + "grad_norm": 0.8636070818618966, + "learning_rate": 0.003, + "loss": 4.059, + "step": 31701 + }, + { + "epoch": 0.31702, + "grad_norm": 0.8575853197039052, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 31702 + }, + { + "epoch": 0.31703, + "grad_norm": 0.8145277062983434, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 31703 + }, + { + "epoch": 0.31704, + "grad_norm": 0.727773256028851, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 31704 + }, + { + "epoch": 0.31705, + "grad_norm": 0.8249551611424246, + "learning_rate": 0.003, + "loss": 4.051, + "step": 31705 + }, + { + "epoch": 0.31706, + "grad_norm": 1.1589169527104535, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 31706 + }, + { + "epoch": 0.31707, + "grad_norm": 0.9803638921902325, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 31707 + }, + { + "epoch": 0.31708, + "grad_norm": 0.8400111912969246, + "learning_rate": 0.003, + "loss": 4.033, + "step": 31708 + }, + { + "epoch": 0.31709, + "grad_norm": 0.7685439607886513, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 31709 + }, + { + "epoch": 0.3171, + "grad_norm": 0.6850874966656303, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 31710 + }, + { + "epoch": 0.31711, + "grad_norm": 0.6137210311218297, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 31711 + }, + { + "epoch": 0.31712, + "grad_norm": 0.6164414020722567, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 31712 + }, + { + "epoch": 0.31713, + "grad_norm": 0.6596844960028684, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 31713 + }, + { + "epoch": 0.31714, + "grad_norm": 0.7848467065290823, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 31714 + }, + { + "epoch": 0.31715, + "grad_norm": 0.8593193421150901, + "learning_rate": 0.003, + "loss": 4.018, + "step": 31715 + }, + { + "epoch": 0.31716, + "grad_norm": 0.955574776903467, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 31716 + }, + { + "epoch": 0.31717, + "grad_norm": 1.0859192160420044, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 31717 + }, + { + "epoch": 0.31718, + "grad_norm": 1.0733369936776542, + "learning_rate": 0.003, + "loss": 4.046, + "step": 31718 + }, + { + "epoch": 0.31719, + "grad_norm": 0.9977448488604509, + "learning_rate": 0.003, + "loss": 4.007, + "step": 31719 + }, + { + "epoch": 0.3172, + "grad_norm": 0.9332232947103541, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 31720 + }, + { + "epoch": 0.31721, + "grad_norm": 0.82974840898276, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 31721 + }, + { + "epoch": 0.31722, + "grad_norm": 0.7352922255500388, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 31722 + }, + { + "epoch": 0.31723, + "grad_norm": 0.7733070717397945, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 31723 + }, + { + "epoch": 0.31724, + "grad_norm": 0.8369714710907813, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 31724 + }, + { + "epoch": 0.31725, + "grad_norm": 0.9619466767659887, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 31725 + }, + { + "epoch": 0.31726, + "grad_norm": 1.0139836647152107, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 31726 + }, + { + "epoch": 0.31727, + "grad_norm": 0.9424026051218393, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 31727 + }, + { + "epoch": 0.31728, + "grad_norm": 1.0059392783450427, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 31728 + }, + { + "epoch": 0.31729, + "grad_norm": 1.0422474170309832, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 31729 + }, + { + "epoch": 0.3173, + "grad_norm": 0.8777005901796452, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 31730 + }, + { + "epoch": 0.31731, + "grad_norm": 0.7515284097710908, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 31731 + }, + { + "epoch": 0.31732, + "grad_norm": 0.6774220962361013, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 31732 + }, + { + "epoch": 0.31733, + "grad_norm": 0.604393727124694, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 31733 + }, + { + "epoch": 0.31734, + "grad_norm": 0.71149275639189, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 31734 + }, + { + "epoch": 0.31735, + "grad_norm": 0.7857232795347587, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 31735 + }, + { + "epoch": 0.31736, + "grad_norm": 0.8808998443633108, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 31736 + }, + { + "epoch": 0.31737, + "grad_norm": 1.0460501392505324, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 31737 + }, + { + "epoch": 0.31738, + "grad_norm": 1.0058761512516605, + "learning_rate": 0.003, + "loss": 4.04, + "step": 31738 + }, + { + "epoch": 0.31739, + "grad_norm": 0.9262393520806653, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 31739 + }, + { + "epoch": 0.3174, + "grad_norm": 0.8754055895703489, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 31740 + }, + { + "epoch": 0.31741, + "grad_norm": 0.8153754084106215, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 31741 + }, + { + "epoch": 0.31742, + "grad_norm": 0.8107919854134675, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 31742 + }, + { + "epoch": 0.31743, + "grad_norm": 0.824600599144597, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 31743 + }, + { + "epoch": 0.31744, + "grad_norm": 0.7925837642496514, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 31744 + }, + { + "epoch": 0.31745, + "grad_norm": 0.9702372691210485, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 31745 + }, + { + "epoch": 0.31746, + "grad_norm": 1.0954310675584324, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 31746 + }, + { + "epoch": 0.31747, + "grad_norm": 0.9881123968325731, + "learning_rate": 0.003, + "loss": 4.0009, + "step": 31747 + }, + { + "epoch": 0.31748, + "grad_norm": 1.0108546815993849, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 31748 + }, + { + "epoch": 0.31749, + "grad_norm": 1.0209991262149867, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 31749 + }, + { + "epoch": 0.3175, + "grad_norm": 0.9782786457242102, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 31750 + }, + { + "epoch": 0.31751, + "grad_norm": 0.8836964357966485, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 31751 + }, + { + "epoch": 0.31752, + "grad_norm": 0.8419562556047677, + "learning_rate": 0.003, + "loss": 4.043, + "step": 31752 + }, + { + "epoch": 0.31753, + "grad_norm": 0.9554254109110003, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 31753 + }, + { + "epoch": 0.31754, + "grad_norm": 1.0496575472030754, + "learning_rate": 0.003, + "loss": 3.9964, + "step": 31754 + }, + { + "epoch": 0.31755, + "grad_norm": 1.046292446016317, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 31755 + }, + { + "epoch": 0.31756, + "grad_norm": 0.7713437348006492, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 31756 + }, + { + "epoch": 0.31757, + "grad_norm": 0.7420067650435133, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 31757 + }, + { + "epoch": 0.31758, + "grad_norm": 0.7617284070458882, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 31758 + }, + { + "epoch": 0.31759, + "grad_norm": 0.7398636782303852, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 31759 + }, + { + "epoch": 0.3176, + "grad_norm": 0.7378628819663552, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 31760 + }, + { + "epoch": 0.31761, + "grad_norm": 0.7508560010749952, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 31761 + }, + { + "epoch": 0.31762, + "grad_norm": 0.6717340930663201, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 31762 + }, + { + "epoch": 0.31763, + "grad_norm": 0.6503279374981161, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 31763 + }, + { + "epoch": 0.31764, + "grad_norm": 0.7061339129080761, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 31764 + }, + { + "epoch": 0.31765, + "grad_norm": 0.6467245609740618, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 31765 + }, + { + "epoch": 0.31766, + "grad_norm": 0.5553141776217694, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 31766 + }, + { + "epoch": 0.31767, + "grad_norm": 0.5794638145098655, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 31767 + }, + { + "epoch": 0.31768, + "grad_norm": 0.62262483503146, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 31768 + }, + { + "epoch": 0.31769, + "grad_norm": 0.5634303803936263, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 31769 + }, + { + "epoch": 0.3177, + "grad_norm": 0.6668720426893265, + "learning_rate": 0.003, + "loss": 4.0041, + "step": 31770 + }, + { + "epoch": 0.31771, + "grad_norm": 0.963613265686622, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 31771 + }, + { + "epoch": 0.31772, + "grad_norm": 1.3017730927257694, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 31772 + }, + { + "epoch": 0.31773, + "grad_norm": 0.6333041936758826, + "learning_rate": 0.003, + "loss": 4.045, + "step": 31773 + }, + { + "epoch": 0.31774, + "grad_norm": 0.7171855708505732, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 31774 + }, + { + "epoch": 0.31775, + "grad_norm": 0.6895554748952585, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 31775 + }, + { + "epoch": 0.31776, + "grad_norm": 0.6684608967333775, + "learning_rate": 0.003, + "loss": 4.017, + "step": 31776 + }, + { + "epoch": 0.31777, + "grad_norm": 0.6404235195527236, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 31777 + }, + { + "epoch": 0.31778, + "grad_norm": 0.5941257335083212, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 31778 + }, + { + "epoch": 0.31779, + "grad_norm": 0.5392608318340594, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 31779 + }, + { + "epoch": 0.3178, + "grad_norm": 0.540607353715306, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 31780 + }, + { + "epoch": 0.31781, + "grad_norm": 0.6447232723666853, + "learning_rate": 0.003, + "loss": 4.004, + "step": 31781 + }, + { + "epoch": 0.31782, + "grad_norm": 0.688999424955781, + "learning_rate": 0.003, + "loss": 4.026, + "step": 31782 + }, + { + "epoch": 0.31783, + "grad_norm": 0.6956078300118124, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 31783 + }, + { + "epoch": 0.31784, + "grad_norm": 0.7476707106380601, + "learning_rate": 0.003, + "loss": 3.9988, + "step": 31784 + }, + { + "epoch": 0.31785, + "grad_norm": 1.014231075078544, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 31785 + }, + { + "epoch": 0.31786, + "grad_norm": 1.2479250128178547, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 31786 + }, + { + "epoch": 0.31787, + "grad_norm": 0.928681568221827, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 31787 + }, + { + "epoch": 0.31788, + "grad_norm": 1.0137099070039748, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 31788 + }, + { + "epoch": 0.31789, + "grad_norm": 0.9075292268362357, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 31789 + }, + { + "epoch": 0.3179, + "grad_norm": 0.9179471014236559, + "learning_rate": 0.003, + "loss": 4.027, + "step": 31790 + }, + { + "epoch": 0.31791, + "grad_norm": 0.9303417786314597, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 31791 + }, + { + "epoch": 0.31792, + "grad_norm": 0.9137408217415345, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 31792 + }, + { + "epoch": 0.31793, + "grad_norm": 0.9165941561414078, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 31793 + }, + { + "epoch": 0.31794, + "grad_norm": 0.9661702796667364, + "learning_rate": 0.003, + "loss": 4.081, + "step": 31794 + }, + { + "epoch": 0.31795, + "grad_norm": 0.9775097566264104, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 31795 + }, + { + "epoch": 0.31796, + "grad_norm": 1.1157457502233663, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 31796 + }, + { + "epoch": 0.31797, + "grad_norm": 0.8856904998154366, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 31797 + }, + { + "epoch": 0.31798, + "grad_norm": 0.8519214561708793, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 31798 + }, + { + "epoch": 0.31799, + "grad_norm": 1.0307727318815474, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 31799 + }, + { + "epoch": 0.318, + "grad_norm": 1.1260676327763377, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 31800 + }, + { + "epoch": 0.31801, + "grad_norm": 0.9716022797439507, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 31801 + }, + { + "epoch": 0.31802, + "grad_norm": 0.9371381317882487, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 31802 + }, + { + "epoch": 0.31803, + "grad_norm": 0.9031837590848479, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 31803 + }, + { + "epoch": 0.31804, + "grad_norm": 0.8394145265131439, + "learning_rate": 0.003, + "loss": 4.041, + "step": 31804 + }, + { + "epoch": 0.31805, + "grad_norm": 0.7248972748404832, + "learning_rate": 0.003, + "loss": 4.025, + "step": 31805 + }, + { + "epoch": 0.31806, + "grad_norm": 0.682553450623044, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 31806 + }, + { + "epoch": 0.31807, + "grad_norm": 0.7222777840702896, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 31807 + }, + { + "epoch": 0.31808, + "grad_norm": 0.8005529487674533, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 31808 + }, + { + "epoch": 0.31809, + "grad_norm": 0.8115304921841447, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 31809 + }, + { + "epoch": 0.3181, + "grad_norm": 0.7669719454365862, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 31810 + }, + { + "epoch": 0.31811, + "grad_norm": 0.7895201875934068, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 31811 + }, + { + "epoch": 0.31812, + "grad_norm": 0.8927677112580112, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 31812 + }, + { + "epoch": 0.31813, + "grad_norm": 0.9568060675741136, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 31813 + }, + { + "epoch": 0.31814, + "grad_norm": 0.9118318764611992, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 31814 + }, + { + "epoch": 0.31815, + "grad_norm": 0.8634885176463749, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 31815 + }, + { + "epoch": 0.31816, + "grad_norm": 0.8958948540650317, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 31816 + }, + { + "epoch": 0.31817, + "grad_norm": 0.8108622053475567, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 31817 + }, + { + "epoch": 0.31818, + "grad_norm": 0.8957666022651664, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 31818 + }, + { + "epoch": 0.31819, + "grad_norm": 1.0827120044309515, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 31819 + }, + { + "epoch": 0.3182, + "grad_norm": 1.0211789663115969, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 31820 + }, + { + "epoch": 0.31821, + "grad_norm": 1.1339129467277016, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 31821 + }, + { + "epoch": 0.31822, + "grad_norm": 0.892129634986221, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 31822 + }, + { + "epoch": 0.31823, + "grad_norm": 0.7482746469833818, + "learning_rate": 0.003, + "loss": 4.027, + "step": 31823 + }, + { + "epoch": 0.31824, + "grad_norm": 0.6882227874567665, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 31824 + }, + { + "epoch": 0.31825, + "grad_norm": 0.6982827864579789, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 31825 + }, + { + "epoch": 0.31826, + "grad_norm": 0.6531882392468825, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 31826 + }, + { + "epoch": 0.31827, + "grad_norm": 0.5800531556713253, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 31827 + }, + { + "epoch": 0.31828, + "grad_norm": 0.6042458106764691, + "learning_rate": 0.003, + "loss": 3.998, + "step": 31828 + }, + { + "epoch": 0.31829, + "grad_norm": 0.6422986811910036, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 31829 + }, + { + "epoch": 0.3183, + "grad_norm": 0.6823041011347539, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 31830 + }, + { + "epoch": 0.31831, + "grad_norm": 0.8486270033535357, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 31831 + }, + { + "epoch": 0.31832, + "grad_norm": 1.1410196262946537, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 31832 + }, + { + "epoch": 0.31833, + "grad_norm": 0.7835169421778341, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 31833 + }, + { + "epoch": 0.31834, + "grad_norm": 0.6837822303249259, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 31834 + }, + { + "epoch": 0.31835, + "grad_norm": 0.7745801809607402, + "learning_rate": 0.003, + "loss": 4.0045, + "step": 31835 + }, + { + "epoch": 0.31836, + "grad_norm": 0.8617406093622209, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 31836 + }, + { + "epoch": 0.31837, + "grad_norm": 1.1386860868326663, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 31837 + }, + { + "epoch": 0.31838, + "grad_norm": 0.9835903219772802, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 31838 + }, + { + "epoch": 0.31839, + "grad_norm": 0.9166239009127802, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 31839 + }, + { + "epoch": 0.3184, + "grad_norm": 0.8592565015918057, + "learning_rate": 0.003, + "loss": 4.063, + "step": 31840 + }, + { + "epoch": 0.31841, + "grad_norm": 0.7758546505070536, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 31841 + }, + { + "epoch": 0.31842, + "grad_norm": 0.6805929767663431, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 31842 + }, + { + "epoch": 0.31843, + "grad_norm": 0.8033730908955115, + "learning_rate": 0.003, + "loss": 4.045, + "step": 31843 + }, + { + "epoch": 0.31844, + "grad_norm": 0.8200219170428044, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 31844 + }, + { + "epoch": 0.31845, + "grad_norm": 0.878433440330917, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 31845 + }, + { + "epoch": 0.31846, + "grad_norm": 1.0453824215515883, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 31846 + }, + { + "epoch": 0.31847, + "grad_norm": 1.0026640689331277, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 31847 + }, + { + "epoch": 0.31848, + "grad_norm": 0.8783832208326118, + "learning_rate": 0.003, + "loss": 4.072, + "step": 31848 + }, + { + "epoch": 0.31849, + "grad_norm": 0.9088781825564077, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 31849 + }, + { + "epoch": 0.3185, + "grad_norm": 0.8797457873021886, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 31850 + }, + { + "epoch": 0.31851, + "grad_norm": 0.7624455043807403, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 31851 + }, + { + "epoch": 0.31852, + "grad_norm": 0.7776255760667412, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 31852 + }, + { + "epoch": 0.31853, + "grad_norm": 1.1551997077015292, + "learning_rate": 0.003, + "loss": 4.072, + "step": 31853 + }, + { + "epoch": 0.31854, + "grad_norm": 1.250925158135599, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 31854 + }, + { + "epoch": 0.31855, + "grad_norm": 0.7086022777841936, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 31855 + }, + { + "epoch": 0.31856, + "grad_norm": 0.7351087587614781, + "learning_rate": 0.003, + "loss": 4.027, + "step": 31856 + }, + { + "epoch": 0.31857, + "grad_norm": 0.7266546245266704, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 31857 + }, + { + "epoch": 0.31858, + "grad_norm": 0.744009910017858, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 31858 + }, + { + "epoch": 0.31859, + "grad_norm": 0.7809740345878033, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 31859 + }, + { + "epoch": 0.3186, + "grad_norm": 0.7778039148190007, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 31860 + }, + { + "epoch": 0.31861, + "grad_norm": 0.7150061711429323, + "learning_rate": 0.003, + "loss": 3.9965, + "step": 31861 + }, + { + "epoch": 0.31862, + "grad_norm": 0.644799035742603, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 31862 + }, + { + "epoch": 0.31863, + "grad_norm": 0.6126949802215979, + "learning_rate": 0.003, + "loss": 4.035, + "step": 31863 + }, + { + "epoch": 0.31864, + "grad_norm": 0.636018137498393, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 31864 + }, + { + "epoch": 0.31865, + "grad_norm": 0.7569269767137659, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 31865 + }, + { + "epoch": 0.31866, + "grad_norm": 0.9729876961048175, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 31866 + }, + { + "epoch": 0.31867, + "grad_norm": 1.1953277764355161, + "learning_rate": 0.003, + "loss": 4.077, + "step": 31867 + }, + { + "epoch": 0.31868, + "grad_norm": 0.7253513343920129, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 31868 + }, + { + "epoch": 0.31869, + "grad_norm": 0.7241339929545165, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 31869 + }, + { + "epoch": 0.3187, + "grad_norm": 0.7606228143120478, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 31870 + }, + { + "epoch": 0.31871, + "grad_norm": 1.0278337448483394, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 31871 + }, + { + "epoch": 0.31872, + "grad_norm": 1.148506643356017, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 31872 + }, + { + "epoch": 0.31873, + "grad_norm": 0.8803192898809925, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 31873 + }, + { + "epoch": 0.31874, + "grad_norm": 0.935921801561099, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 31874 + }, + { + "epoch": 0.31875, + "grad_norm": 0.8294268018534112, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 31875 + }, + { + "epoch": 0.31876, + "grad_norm": 0.870504744694578, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 31876 + }, + { + "epoch": 0.31877, + "grad_norm": 0.9819034957616358, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 31877 + }, + { + "epoch": 0.31878, + "grad_norm": 1.2297107231441387, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 31878 + }, + { + "epoch": 0.31879, + "grad_norm": 0.7977295834568685, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 31879 + }, + { + "epoch": 0.3188, + "grad_norm": 0.9444682432242328, + "learning_rate": 0.003, + "loss": 4.072, + "step": 31880 + }, + { + "epoch": 0.31881, + "grad_norm": 0.9603971893538169, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 31881 + }, + { + "epoch": 0.31882, + "grad_norm": 0.9146825209963074, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 31882 + }, + { + "epoch": 0.31883, + "grad_norm": 0.9947232258316071, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 31883 + }, + { + "epoch": 0.31884, + "grad_norm": 1.0714214145737804, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 31884 + }, + { + "epoch": 0.31885, + "grad_norm": 0.8610196917402594, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 31885 + }, + { + "epoch": 0.31886, + "grad_norm": 0.7501043688858714, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 31886 + }, + { + "epoch": 0.31887, + "grad_norm": 0.8532363873883472, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 31887 + }, + { + "epoch": 0.31888, + "grad_norm": 0.9179891817940065, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 31888 + }, + { + "epoch": 0.31889, + "grad_norm": 0.8141065635257053, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 31889 + }, + { + "epoch": 0.3189, + "grad_norm": 0.7740498787229562, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 31890 + }, + { + "epoch": 0.31891, + "grad_norm": 0.726347293847583, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 31891 + }, + { + "epoch": 0.31892, + "grad_norm": 0.682267283870932, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 31892 + }, + { + "epoch": 0.31893, + "grad_norm": 0.7880407825475153, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 31893 + }, + { + "epoch": 0.31894, + "grad_norm": 0.7971800775877619, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 31894 + }, + { + "epoch": 0.31895, + "grad_norm": 0.7432508734361885, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 31895 + }, + { + "epoch": 0.31896, + "grad_norm": 0.8030601661175567, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 31896 + }, + { + "epoch": 0.31897, + "grad_norm": 0.7918071698226915, + "learning_rate": 0.003, + "loss": 4.057, + "step": 31897 + }, + { + "epoch": 0.31898, + "grad_norm": 0.7323831223777469, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 31898 + }, + { + "epoch": 0.31899, + "grad_norm": 0.6252765559134325, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 31899 + }, + { + "epoch": 0.319, + "grad_norm": 0.6237808591411749, + "learning_rate": 0.003, + "loss": 3.9988, + "step": 31900 + }, + { + "epoch": 0.31901, + "grad_norm": 0.6316253014947295, + "learning_rate": 0.003, + "loss": 4.033, + "step": 31901 + }, + { + "epoch": 0.31902, + "grad_norm": 0.6599030080662733, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 31902 + }, + { + "epoch": 0.31903, + "grad_norm": 0.7019997153506355, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 31903 + }, + { + "epoch": 0.31904, + "grad_norm": 1.0287377635358812, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 31904 + }, + { + "epoch": 0.31905, + "grad_norm": 1.4557453400171543, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 31905 + }, + { + "epoch": 0.31906, + "grad_norm": 0.4650270959278082, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 31906 + }, + { + "epoch": 0.31907, + "grad_norm": 0.9024237864340765, + "learning_rate": 0.003, + "loss": 4.068, + "step": 31907 + }, + { + "epoch": 0.31908, + "grad_norm": 1.0455128937368943, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 31908 + }, + { + "epoch": 0.31909, + "grad_norm": 0.909997028275504, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 31909 + }, + { + "epoch": 0.3191, + "grad_norm": 0.8968564276745242, + "learning_rate": 0.003, + "loss": 4.053, + "step": 31910 + }, + { + "epoch": 0.31911, + "grad_norm": 0.8784667503563353, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 31911 + }, + { + "epoch": 0.31912, + "grad_norm": 0.7921184072901972, + "learning_rate": 0.003, + "loss": 4.0047, + "step": 31912 + }, + { + "epoch": 0.31913, + "grad_norm": 0.6897666211196851, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 31913 + }, + { + "epoch": 0.31914, + "grad_norm": 0.647771668483397, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 31914 + }, + { + "epoch": 0.31915, + "grad_norm": 0.7268458730937448, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 31915 + }, + { + "epoch": 0.31916, + "grad_norm": 0.845508764329991, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 31916 + }, + { + "epoch": 0.31917, + "grad_norm": 1.1616235133580697, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 31917 + }, + { + "epoch": 0.31918, + "grad_norm": 0.9374671244888748, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 31918 + }, + { + "epoch": 0.31919, + "grad_norm": 0.7963473771294272, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 31919 + }, + { + "epoch": 0.3192, + "grad_norm": 0.7485844335205557, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 31920 + }, + { + "epoch": 0.31921, + "grad_norm": 0.766799389457276, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 31921 + }, + { + "epoch": 0.31922, + "grad_norm": 0.7377989774261214, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 31922 + }, + { + "epoch": 0.31923, + "grad_norm": 0.7098106538463989, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 31923 + }, + { + "epoch": 0.31924, + "grad_norm": 0.6852794619243266, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 31924 + }, + { + "epoch": 0.31925, + "grad_norm": 0.8898638814841711, + "learning_rate": 0.003, + "loss": 4.041, + "step": 31925 + }, + { + "epoch": 0.31926, + "grad_norm": 1.1011865426529608, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 31926 + }, + { + "epoch": 0.31927, + "grad_norm": 0.9805539855453584, + "learning_rate": 0.003, + "loss": 4.02, + "step": 31927 + }, + { + "epoch": 0.31928, + "grad_norm": 0.9390630574992901, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 31928 + }, + { + "epoch": 0.31929, + "grad_norm": 0.8820689706047953, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 31929 + }, + { + "epoch": 0.3193, + "grad_norm": 0.966870621329203, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 31930 + }, + { + "epoch": 0.31931, + "grad_norm": 0.816414801442235, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 31931 + }, + { + "epoch": 0.31932, + "grad_norm": 0.7819685647382922, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 31932 + }, + { + "epoch": 0.31933, + "grad_norm": 0.7446792355799742, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 31933 + }, + { + "epoch": 0.31934, + "grad_norm": 0.7310166143431301, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 31934 + }, + { + "epoch": 0.31935, + "grad_norm": 0.742723102377141, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 31935 + }, + { + "epoch": 0.31936, + "grad_norm": 0.8960097343645532, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 31936 + }, + { + "epoch": 0.31937, + "grad_norm": 1.1144163132169604, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 31937 + }, + { + "epoch": 0.31938, + "grad_norm": 1.166727186821888, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 31938 + }, + { + "epoch": 0.31939, + "grad_norm": 0.8322959088244993, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 31939 + }, + { + "epoch": 0.3194, + "grad_norm": 0.6499166343754063, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 31940 + }, + { + "epoch": 0.31941, + "grad_norm": 0.6762872507013268, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 31941 + }, + { + "epoch": 0.31942, + "grad_norm": 0.7360632590097111, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 31942 + }, + { + "epoch": 0.31943, + "grad_norm": 0.8995858736423183, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 31943 + }, + { + "epoch": 0.31944, + "grad_norm": 0.9958386505954763, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 31944 + }, + { + "epoch": 0.31945, + "grad_norm": 1.0567270409490133, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 31945 + }, + { + "epoch": 0.31946, + "grad_norm": 0.7778097893841952, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 31946 + }, + { + "epoch": 0.31947, + "grad_norm": 0.7347048817329094, + "learning_rate": 0.003, + "loss": 4.038, + "step": 31947 + }, + { + "epoch": 0.31948, + "grad_norm": 0.751947048501362, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 31948 + }, + { + "epoch": 0.31949, + "grad_norm": 0.7483546704206897, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 31949 + }, + { + "epoch": 0.3195, + "grad_norm": 0.8985366964581385, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 31950 + }, + { + "epoch": 0.31951, + "grad_norm": 1.1174277150462961, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 31951 + }, + { + "epoch": 0.31952, + "grad_norm": 0.783229845699387, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 31952 + }, + { + "epoch": 0.31953, + "grad_norm": 0.7086187501330949, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 31953 + }, + { + "epoch": 0.31954, + "grad_norm": 0.753200063198879, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 31954 + }, + { + "epoch": 0.31955, + "grad_norm": 0.7130914308031854, + "learning_rate": 0.003, + "loss": 4.037, + "step": 31955 + }, + { + "epoch": 0.31956, + "grad_norm": 0.715177737475376, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 31956 + }, + { + "epoch": 0.31957, + "grad_norm": 0.9306238073941161, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 31957 + }, + { + "epoch": 0.31958, + "grad_norm": 1.1424727918683524, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 31958 + }, + { + "epoch": 0.31959, + "grad_norm": 0.7984644889502183, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 31959 + }, + { + "epoch": 0.3196, + "grad_norm": 0.7409439322315312, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 31960 + }, + { + "epoch": 0.31961, + "grad_norm": 0.6104016610407806, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 31961 + }, + { + "epoch": 0.31962, + "grad_norm": 0.6082768451244568, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 31962 + }, + { + "epoch": 0.31963, + "grad_norm": 0.6410355093824731, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 31963 + }, + { + "epoch": 0.31964, + "grad_norm": 0.6514978851839643, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 31964 + }, + { + "epoch": 0.31965, + "grad_norm": 0.760157681534477, + "learning_rate": 0.003, + "loss": 4.088, + "step": 31965 + }, + { + "epoch": 0.31966, + "grad_norm": 0.7916998017448065, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 31966 + }, + { + "epoch": 0.31967, + "grad_norm": 0.7310848997868826, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 31967 + }, + { + "epoch": 0.31968, + "grad_norm": 0.758928172355569, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 31968 + }, + { + "epoch": 0.31969, + "grad_norm": 0.7843576241612726, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 31969 + }, + { + "epoch": 0.3197, + "grad_norm": 0.7893309438032472, + "learning_rate": 0.003, + "loss": 4.0007, + "step": 31970 + }, + { + "epoch": 0.31971, + "grad_norm": 0.7623347897582147, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 31971 + }, + { + "epoch": 0.31972, + "grad_norm": 0.7283270775986949, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 31972 + }, + { + "epoch": 0.31973, + "grad_norm": 0.7759202820621282, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 31973 + }, + { + "epoch": 0.31974, + "grad_norm": 0.9569202192117368, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 31974 + }, + { + "epoch": 0.31975, + "grad_norm": 1.1699870708580904, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 31975 + }, + { + "epoch": 0.31976, + "grad_norm": 1.1598244945937317, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 31976 + }, + { + "epoch": 0.31977, + "grad_norm": 1.0714472782539628, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 31977 + }, + { + "epoch": 0.31978, + "grad_norm": 0.9585752891570171, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 31978 + }, + { + "epoch": 0.31979, + "grad_norm": 0.9935764844332361, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 31979 + }, + { + "epoch": 0.3198, + "grad_norm": 0.917970692055569, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 31980 + }, + { + "epoch": 0.31981, + "grad_norm": 0.8948464076222182, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 31981 + }, + { + "epoch": 0.31982, + "grad_norm": 0.9420091259137481, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 31982 + }, + { + "epoch": 0.31983, + "grad_norm": 0.9080774246774516, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 31983 + }, + { + "epoch": 0.31984, + "grad_norm": 0.8882614337397354, + "learning_rate": 0.003, + "loss": 4.041, + "step": 31984 + }, + { + "epoch": 0.31985, + "grad_norm": 0.9289677986427315, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 31985 + }, + { + "epoch": 0.31986, + "grad_norm": 0.9078188305859279, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 31986 + }, + { + "epoch": 0.31987, + "grad_norm": 0.9063446703135518, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 31987 + }, + { + "epoch": 0.31988, + "grad_norm": 0.9184602061902861, + "learning_rate": 0.003, + "loss": 4.051, + "step": 31988 + }, + { + "epoch": 0.31989, + "grad_norm": 0.8558604591744582, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 31989 + }, + { + "epoch": 0.3199, + "grad_norm": 0.8343724839043628, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 31990 + }, + { + "epoch": 0.31991, + "grad_norm": 0.8710951068245222, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 31991 + }, + { + "epoch": 0.31992, + "grad_norm": 0.9902186178784111, + "learning_rate": 0.003, + "loss": 4.052, + "step": 31992 + }, + { + "epoch": 0.31993, + "grad_norm": 1.0243591008733897, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 31993 + }, + { + "epoch": 0.31994, + "grad_norm": 0.957746057148872, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 31994 + }, + { + "epoch": 0.31995, + "grad_norm": 0.9123419283478792, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 31995 + }, + { + "epoch": 0.31996, + "grad_norm": 1.0690989717824446, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 31996 + }, + { + "epoch": 0.31997, + "grad_norm": 1.0593332050725643, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 31997 + }, + { + "epoch": 0.31998, + "grad_norm": 0.9381075966847782, + "learning_rate": 0.003, + "loss": 4.066, + "step": 31998 + }, + { + "epoch": 0.31999, + "grad_norm": 1.027614119195915, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 31999 + }, + { + "epoch": 0.32, + "grad_norm": 0.9937547999191643, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 32000 + }, + { + "epoch": 0.32001, + "grad_norm": 0.8905727663611102, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 32001 + }, + { + "epoch": 0.32002, + "grad_norm": 0.9283239791206751, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 32002 + }, + { + "epoch": 0.32003, + "grad_norm": 1.0712964649809305, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 32003 + }, + { + "epoch": 0.32004, + "grad_norm": 0.9842351369792838, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 32004 + }, + { + "epoch": 0.32005, + "grad_norm": 1.0342623115897605, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 32005 + }, + { + "epoch": 0.32006, + "grad_norm": 1.0297046518059014, + "learning_rate": 0.003, + "loss": 4.08, + "step": 32006 + }, + { + "epoch": 0.32007, + "grad_norm": 1.0348028632206066, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 32007 + }, + { + "epoch": 0.32008, + "grad_norm": 0.862864699643544, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 32008 + }, + { + "epoch": 0.32009, + "grad_norm": 0.7739315000977706, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 32009 + }, + { + "epoch": 0.3201, + "grad_norm": 0.7595435751321571, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 32010 + }, + { + "epoch": 0.32011, + "grad_norm": 0.8608867843721222, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 32011 + }, + { + "epoch": 0.32012, + "grad_norm": 0.8328745602274971, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 32012 + }, + { + "epoch": 0.32013, + "grad_norm": 0.7177502500998435, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 32013 + }, + { + "epoch": 0.32014, + "grad_norm": 0.552308721647943, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 32014 + }, + { + "epoch": 0.32015, + "grad_norm": 0.5345347648465099, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 32015 + }, + { + "epoch": 0.32016, + "grad_norm": 0.5636578262000033, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 32016 + }, + { + "epoch": 0.32017, + "grad_norm": 0.6157988675474264, + "learning_rate": 0.003, + "loss": 4.0052, + "step": 32017 + }, + { + "epoch": 0.32018, + "grad_norm": 0.6951204648170406, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 32018 + }, + { + "epoch": 0.32019, + "grad_norm": 0.813312603394071, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 32019 + }, + { + "epoch": 0.3202, + "grad_norm": 0.9478989511370306, + "learning_rate": 0.003, + "loss": 4.039, + "step": 32020 + }, + { + "epoch": 0.32021, + "grad_norm": 1.125575212766914, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 32021 + }, + { + "epoch": 0.32022, + "grad_norm": 0.7704911596245103, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 32022 + }, + { + "epoch": 0.32023, + "grad_norm": 0.6673891044289912, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 32023 + }, + { + "epoch": 0.32024, + "grad_norm": 0.5042939130741306, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 32024 + }, + { + "epoch": 0.32025, + "grad_norm": 0.45621597283630866, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 32025 + }, + { + "epoch": 0.32026, + "grad_norm": 0.454114493943505, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 32026 + }, + { + "epoch": 0.32027, + "grad_norm": 0.4850994891568993, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 32027 + }, + { + "epoch": 0.32028, + "grad_norm": 0.47933033162671634, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 32028 + }, + { + "epoch": 0.32029, + "grad_norm": 0.5415419166312868, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 32029 + }, + { + "epoch": 0.3203, + "grad_norm": 0.5856828794473122, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 32030 + }, + { + "epoch": 0.32031, + "grad_norm": 0.7433198055512185, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 32031 + }, + { + "epoch": 0.32032, + "grad_norm": 1.0725019398351328, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 32032 + }, + { + "epoch": 0.32033, + "grad_norm": 0.9178820247270171, + "learning_rate": 0.003, + "loss": 4.0043, + "step": 32033 + }, + { + "epoch": 0.32034, + "grad_norm": 0.6766540669992053, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 32034 + }, + { + "epoch": 0.32035, + "grad_norm": 0.5881938728460938, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 32035 + }, + { + "epoch": 0.32036, + "grad_norm": 0.7511388806391386, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 32036 + }, + { + "epoch": 0.32037, + "grad_norm": 1.0097299657858265, + "learning_rate": 0.003, + "loss": 4.004, + "step": 32037 + }, + { + "epoch": 0.32038, + "grad_norm": 1.1085428214065751, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 32038 + }, + { + "epoch": 0.32039, + "grad_norm": 0.8595820754968798, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 32039 + }, + { + "epoch": 0.3204, + "grad_norm": 0.8859817443084181, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 32040 + }, + { + "epoch": 0.32041, + "grad_norm": 0.8161877203939851, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 32041 + }, + { + "epoch": 0.32042, + "grad_norm": 0.7726546588569224, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 32042 + }, + { + "epoch": 0.32043, + "grad_norm": 0.8550862182864043, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 32043 + }, + { + "epoch": 0.32044, + "grad_norm": 0.8593122043528518, + "learning_rate": 0.003, + "loss": 4.035, + "step": 32044 + }, + { + "epoch": 0.32045, + "grad_norm": 0.9016315772440688, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 32045 + }, + { + "epoch": 0.32046, + "grad_norm": 0.9641285966818746, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 32046 + }, + { + "epoch": 0.32047, + "grad_norm": 0.8240340616146082, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 32047 + }, + { + "epoch": 0.32048, + "grad_norm": 0.8257327154403905, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 32048 + }, + { + "epoch": 0.32049, + "grad_norm": 0.7431354495647136, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 32049 + }, + { + "epoch": 0.3205, + "grad_norm": 0.8331736631628603, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 32050 + }, + { + "epoch": 0.32051, + "grad_norm": 0.9813366636502296, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 32051 + }, + { + "epoch": 0.32052, + "grad_norm": 1.1221775774811966, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 32052 + }, + { + "epoch": 0.32053, + "grad_norm": 0.8054868597704876, + "learning_rate": 0.003, + "loss": 4.032, + "step": 32053 + }, + { + "epoch": 0.32054, + "grad_norm": 0.706791238466667, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 32054 + }, + { + "epoch": 0.32055, + "grad_norm": 0.6960522115682027, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 32055 + }, + { + "epoch": 0.32056, + "grad_norm": 0.8509576114290087, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 32056 + }, + { + "epoch": 0.32057, + "grad_norm": 1.0794484715577326, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 32057 + }, + { + "epoch": 0.32058, + "grad_norm": 1.111290368615515, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 32058 + }, + { + "epoch": 0.32059, + "grad_norm": 0.9880047681132434, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 32059 + }, + { + "epoch": 0.3206, + "grad_norm": 1.0896215378464862, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 32060 + }, + { + "epoch": 0.32061, + "grad_norm": 0.9015748358552653, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 32061 + }, + { + "epoch": 0.32062, + "grad_norm": 0.8370710672603827, + "learning_rate": 0.003, + "loss": 4.038, + "step": 32062 + }, + { + "epoch": 0.32063, + "grad_norm": 0.6789203115178115, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 32063 + }, + { + "epoch": 0.32064, + "grad_norm": 0.5744165149204089, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 32064 + }, + { + "epoch": 0.32065, + "grad_norm": 0.5923086862454859, + "learning_rate": 0.003, + "loss": 4.043, + "step": 32065 + }, + { + "epoch": 0.32066, + "grad_norm": 0.5769409259527578, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 32066 + }, + { + "epoch": 0.32067, + "grad_norm": 0.5394694322475562, + "learning_rate": 0.003, + "loss": 4.036, + "step": 32067 + }, + { + "epoch": 0.32068, + "grad_norm": 0.6265894815291089, + "learning_rate": 0.003, + "loss": 4.063, + "step": 32068 + }, + { + "epoch": 0.32069, + "grad_norm": 0.7129656932181448, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 32069 + }, + { + "epoch": 0.3207, + "grad_norm": 0.8413683669794733, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 32070 + }, + { + "epoch": 0.32071, + "grad_norm": 1.084772940164286, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 32071 + }, + { + "epoch": 0.32072, + "grad_norm": 1.06087527649488, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 32072 + }, + { + "epoch": 0.32073, + "grad_norm": 1.1109637223846296, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 32073 + }, + { + "epoch": 0.32074, + "grad_norm": 0.8431063578273923, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 32074 + }, + { + "epoch": 0.32075, + "grad_norm": 0.6849595773444788, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 32075 + }, + { + "epoch": 0.32076, + "grad_norm": 0.6116047493926089, + "learning_rate": 0.003, + "loss": 4.014, + "step": 32076 + }, + { + "epoch": 0.32077, + "grad_norm": 0.6902054443464168, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 32077 + }, + { + "epoch": 0.32078, + "grad_norm": 0.7339449794102021, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 32078 + }, + { + "epoch": 0.32079, + "grad_norm": 0.7340248541272907, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 32079 + }, + { + "epoch": 0.3208, + "grad_norm": 0.8109322069769744, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 32080 + }, + { + "epoch": 0.32081, + "grad_norm": 0.85212664842251, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 32081 + }, + { + "epoch": 0.32082, + "grad_norm": 0.9696166006524929, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 32082 + }, + { + "epoch": 0.32083, + "grad_norm": 1.100816221844497, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 32083 + }, + { + "epoch": 0.32084, + "grad_norm": 0.8610194984483136, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 32084 + }, + { + "epoch": 0.32085, + "grad_norm": 0.8404777189188042, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 32085 + }, + { + "epoch": 0.32086, + "grad_norm": 0.8208337509444262, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 32086 + }, + { + "epoch": 0.32087, + "grad_norm": 0.9331090670080582, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 32087 + }, + { + "epoch": 0.32088, + "grad_norm": 1.1745248102925971, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 32088 + }, + { + "epoch": 0.32089, + "grad_norm": 0.90800290391079, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 32089 + }, + { + "epoch": 0.3209, + "grad_norm": 0.803760798579035, + "learning_rate": 0.003, + "loss": 4.0037, + "step": 32090 + }, + { + "epoch": 0.32091, + "grad_norm": 0.7298883312138952, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 32091 + }, + { + "epoch": 0.32092, + "grad_norm": 0.8000463583581157, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 32092 + }, + { + "epoch": 0.32093, + "grad_norm": 1.0098026179270745, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 32093 + }, + { + "epoch": 0.32094, + "grad_norm": 0.9871673356263472, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 32094 + }, + { + "epoch": 0.32095, + "grad_norm": 0.9244930998870973, + "learning_rate": 0.003, + "loss": 4.07, + "step": 32095 + }, + { + "epoch": 0.32096, + "grad_norm": 0.8390389165978216, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 32096 + }, + { + "epoch": 0.32097, + "grad_norm": 0.9046044260634625, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 32097 + }, + { + "epoch": 0.32098, + "grad_norm": 0.9711921274365544, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 32098 + }, + { + "epoch": 0.32099, + "grad_norm": 0.882981613880237, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 32099 + }, + { + "epoch": 0.321, + "grad_norm": 0.9215891946754206, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 32100 + }, + { + "epoch": 0.32101, + "grad_norm": 0.9419135824789112, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 32101 + }, + { + "epoch": 0.32102, + "grad_norm": 0.8762218318332125, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 32102 + }, + { + "epoch": 0.32103, + "grad_norm": 0.9633899480890369, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 32103 + }, + { + "epoch": 0.32104, + "grad_norm": 1.0866969231590793, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 32104 + }, + { + "epoch": 0.32105, + "grad_norm": 0.8925659852896375, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 32105 + }, + { + "epoch": 0.32106, + "grad_norm": 0.8519269665352203, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 32106 + }, + { + "epoch": 0.32107, + "grad_norm": 0.8586792653479003, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 32107 + }, + { + "epoch": 0.32108, + "grad_norm": 0.8868928651513859, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 32108 + }, + { + "epoch": 0.32109, + "grad_norm": 1.0358699712309498, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 32109 + }, + { + "epoch": 0.3211, + "grad_norm": 1.0209306985413853, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 32110 + }, + { + "epoch": 0.32111, + "grad_norm": 1.1961841303130454, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 32111 + }, + { + "epoch": 0.32112, + "grad_norm": 0.8890401509451059, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 32112 + }, + { + "epoch": 0.32113, + "grad_norm": 0.7257698596482696, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 32113 + }, + { + "epoch": 0.32114, + "grad_norm": 0.7600299616133965, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 32114 + }, + { + "epoch": 0.32115, + "grad_norm": 0.8066728727239163, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 32115 + }, + { + "epoch": 0.32116, + "grad_norm": 0.7775506323895115, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 32116 + }, + { + "epoch": 0.32117, + "grad_norm": 0.799621343871532, + "learning_rate": 0.003, + "loss": 4.034, + "step": 32117 + }, + { + "epoch": 0.32118, + "grad_norm": 0.7836591909383303, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 32118 + }, + { + "epoch": 0.32119, + "grad_norm": 0.7575670372501762, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 32119 + }, + { + "epoch": 0.3212, + "grad_norm": 0.6859761449611931, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 32120 + }, + { + "epoch": 0.32121, + "grad_norm": 0.6530805590337593, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 32121 + }, + { + "epoch": 0.32122, + "grad_norm": 0.6686021483111871, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 32122 + }, + { + "epoch": 0.32123, + "grad_norm": 0.6783093662036036, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 32123 + }, + { + "epoch": 0.32124, + "grad_norm": 0.6840653642252628, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 32124 + }, + { + "epoch": 0.32125, + "grad_norm": 0.7274586638600474, + "learning_rate": 0.003, + "loss": 4.055, + "step": 32125 + }, + { + "epoch": 0.32126, + "grad_norm": 0.7482039171449799, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 32126 + }, + { + "epoch": 0.32127, + "grad_norm": 0.748509753445551, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 32127 + }, + { + "epoch": 0.32128, + "grad_norm": 0.7683615222643407, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 32128 + }, + { + "epoch": 0.32129, + "grad_norm": 0.7865344886340901, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 32129 + }, + { + "epoch": 0.3213, + "grad_norm": 0.7955579252103503, + "learning_rate": 0.003, + "loss": 3.9908, + "step": 32130 + }, + { + "epoch": 0.32131, + "grad_norm": 0.8543216598714775, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 32131 + }, + { + "epoch": 0.32132, + "grad_norm": 0.9790653943017321, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 32132 + }, + { + "epoch": 0.32133, + "grad_norm": 1.0591000207726944, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 32133 + }, + { + "epoch": 0.32134, + "grad_norm": 0.905965588264445, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 32134 + }, + { + "epoch": 0.32135, + "grad_norm": 0.7624284926349406, + "learning_rate": 0.003, + "loss": 4.032, + "step": 32135 + }, + { + "epoch": 0.32136, + "grad_norm": 0.8409851542076126, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 32136 + }, + { + "epoch": 0.32137, + "grad_norm": 0.8487114988418342, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 32137 + }, + { + "epoch": 0.32138, + "grad_norm": 0.9158504983676331, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 32138 + }, + { + "epoch": 0.32139, + "grad_norm": 1.0736450333376792, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 32139 + }, + { + "epoch": 0.3214, + "grad_norm": 1.0195628717716176, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 32140 + }, + { + "epoch": 0.32141, + "grad_norm": 0.9963059262121619, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 32141 + }, + { + "epoch": 0.32142, + "grad_norm": 0.9216265487146964, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 32142 + }, + { + "epoch": 0.32143, + "grad_norm": 0.9165688487642132, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 32143 + }, + { + "epoch": 0.32144, + "grad_norm": 0.9263856305943822, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 32144 + }, + { + "epoch": 0.32145, + "grad_norm": 1.0283616564988651, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 32145 + }, + { + "epoch": 0.32146, + "grad_norm": 1.0672700688422256, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 32146 + }, + { + "epoch": 0.32147, + "grad_norm": 1.0271712466774654, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 32147 + }, + { + "epoch": 0.32148, + "grad_norm": 0.9855380513077183, + "learning_rate": 0.003, + "loss": 4.009, + "step": 32148 + }, + { + "epoch": 0.32149, + "grad_norm": 1.0045653473201643, + "learning_rate": 0.003, + "loss": 3.9872, + "step": 32149 + }, + { + "epoch": 0.3215, + "grad_norm": 0.975240632406502, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 32150 + }, + { + "epoch": 0.32151, + "grad_norm": 0.8976111961558524, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 32151 + }, + { + "epoch": 0.32152, + "grad_norm": 0.899894650641608, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 32152 + }, + { + "epoch": 0.32153, + "grad_norm": 0.9430697230794242, + "learning_rate": 0.003, + "loss": 4.053, + "step": 32153 + }, + { + "epoch": 0.32154, + "grad_norm": 0.95495918562649, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 32154 + }, + { + "epoch": 0.32155, + "grad_norm": 0.998530802639755, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 32155 + }, + { + "epoch": 0.32156, + "grad_norm": 1.0339729858959528, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 32156 + }, + { + "epoch": 0.32157, + "grad_norm": 0.8324672480654042, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 32157 + }, + { + "epoch": 0.32158, + "grad_norm": 0.6754175273029288, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 32158 + }, + { + "epoch": 0.32159, + "grad_norm": 0.7061527675052528, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 32159 + }, + { + "epoch": 0.3216, + "grad_norm": 0.682645671276685, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 32160 + }, + { + "epoch": 0.32161, + "grad_norm": 0.7226975796243015, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 32161 + }, + { + "epoch": 0.32162, + "grad_norm": 0.6423839759714424, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 32162 + }, + { + "epoch": 0.32163, + "grad_norm": 0.6171529861686078, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 32163 + }, + { + "epoch": 0.32164, + "grad_norm": 0.5535345037800211, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 32164 + }, + { + "epoch": 0.32165, + "grad_norm": 0.6144168666594476, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 32165 + }, + { + "epoch": 0.32166, + "grad_norm": 0.6993477491479677, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 32166 + }, + { + "epoch": 0.32167, + "grad_norm": 0.8569248077675105, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 32167 + }, + { + "epoch": 0.32168, + "grad_norm": 0.9530898602539767, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 32168 + }, + { + "epoch": 0.32169, + "grad_norm": 0.9685158671700538, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 32169 + }, + { + "epoch": 0.3217, + "grad_norm": 1.036377177468917, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 32170 + }, + { + "epoch": 0.32171, + "grad_norm": 0.971099752842579, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 32171 + }, + { + "epoch": 0.32172, + "grad_norm": 1.005443241965482, + "learning_rate": 0.003, + "loss": 4.072, + "step": 32172 + }, + { + "epoch": 0.32173, + "grad_norm": 0.8936870380582859, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 32173 + }, + { + "epoch": 0.32174, + "grad_norm": 0.8432079011872359, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 32174 + }, + { + "epoch": 0.32175, + "grad_norm": 0.8515503027872529, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 32175 + }, + { + "epoch": 0.32176, + "grad_norm": 0.7896117432751558, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 32176 + }, + { + "epoch": 0.32177, + "grad_norm": 0.8345284469269142, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 32177 + }, + { + "epoch": 0.32178, + "grad_norm": 0.8153377890763066, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 32178 + }, + { + "epoch": 0.32179, + "grad_norm": 0.6914299117044597, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 32179 + }, + { + "epoch": 0.3218, + "grad_norm": 0.6469985704339442, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 32180 + }, + { + "epoch": 0.32181, + "grad_norm": 0.6382553203869028, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 32181 + }, + { + "epoch": 0.32182, + "grad_norm": 0.6481811022517221, + "learning_rate": 0.003, + "loss": 4.033, + "step": 32182 + }, + { + "epoch": 0.32183, + "grad_norm": 0.7689969069219974, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 32183 + }, + { + "epoch": 0.32184, + "grad_norm": 0.9185585922855081, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 32184 + }, + { + "epoch": 0.32185, + "grad_norm": 0.9686786411759901, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 32185 + }, + { + "epoch": 0.32186, + "grad_norm": 0.9153408590140227, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 32186 + }, + { + "epoch": 0.32187, + "grad_norm": 0.7854690370017651, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 32187 + }, + { + "epoch": 0.32188, + "grad_norm": 0.6547452956881445, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 32188 + }, + { + "epoch": 0.32189, + "grad_norm": 0.6018545522644374, + "learning_rate": 0.003, + "loss": 4.025, + "step": 32189 + }, + { + "epoch": 0.3219, + "grad_norm": 0.5654801667369025, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 32190 + }, + { + "epoch": 0.32191, + "grad_norm": 0.5998660720510379, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 32191 + }, + { + "epoch": 0.32192, + "grad_norm": 0.6835821034539953, + "learning_rate": 0.003, + "loss": 4.024, + "step": 32192 + }, + { + "epoch": 0.32193, + "grad_norm": 0.8830123389916543, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 32193 + }, + { + "epoch": 0.32194, + "grad_norm": 1.0605787209653907, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 32194 + }, + { + "epoch": 0.32195, + "grad_norm": 0.8424570982848152, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 32195 + }, + { + "epoch": 0.32196, + "grad_norm": 0.7803156647175802, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 32196 + }, + { + "epoch": 0.32197, + "grad_norm": 0.8265753317923835, + "learning_rate": 0.003, + "loss": 4.058, + "step": 32197 + }, + { + "epoch": 0.32198, + "grad_norm": 0.9310412078309459, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 32198 + }, + { + "epoch": 0.32199, + "grad_norm": 1.0017448024794426, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 32199 + }, + { + "epoch": 0.322, + "grad_norm": 0.9041079476170394, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 32200 + }, + { + "epoch": 0.32201, + "grad_norm": 1.1173892516809483, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 32201 + }, + { + "epoch": 0.32202, + "grad_norm": 1.0807424930459617, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 32202 + }, + { + "epoch": 0.32203, + "grad_norm": 0.9509635180401249, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 32203 + }, + { + "epoch": 0.32204, + "grad_norm": 0.9882575237381347, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 32204 + }, + { + "epoch": 0.32205, + "grad_norm": 0.8903634729363463, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 32205 + }, + { + "epoch": 0.32206, + "grad_norm": 0.715512013027559, + "learning_rate": 0.003, + "loss": 4.062, + "step": 32206 + }, + { + "epoch": 0.32207, + "grad_norm": 0.6368173781783356, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 32207 + }, + { + "epoch": 0.32208, + "grad_norm": 0.571687560947578, + "learning_rate": 0.003, + "loss": 4.049, + "step": 32208 + }, + { + "epoch": 0.32209, + "grad_norm": 0.48536364745086374, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 32209 + }, + { + "epoch": 0.3221, + "grad_norm": 0.5017539330898301, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 32210 + }, + { + "epoch": 0.32211, + "grad_norm": 0.5651514044797669, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 32211 + }, + { + "epoch": 0.32212, + "grad_norm": 0.6390221402605062, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 32212 + }, + { + "epoch": 0.32213, + "grad_norm": 0.6826391376508518, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 32213 + }, + { + "epoch": 0.32214, + "grad_norm": 0.7487190103251911, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 32214 + }, + { + "epoch": 0.32215, + "grad_norm": 0.8442246955186923, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 32215 + }, + { + "epoch": 0.32216, + "grad_norm": 1.094984354421479, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 32216 + }, + { + "epoch": 0.32217, + "grad_norm": 1.038931952573693, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 32217 + }, + { + "epoch": 0.32218, + "grad_norm": 0.8935098924138787, + "learning_rate": 0.003, + "loss": 4.035, + "step": 32218 + }, + { + "epoch": 0.32219, + "grad_norm": 0.8249351776654388, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 32219 + }, + { + "epoch": 0.3222, + "grad_norm": 0.8168983319342156, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 32220 + }, + { + "epoch": 0.32221, + "grad_norm": 0.7735788835321203, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 32221 + }, + { + "epoch": 0.32222, + "grad_norm": 0.8034233429912587, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 32222 + }, + { + "epoch": 0.32223, + "grad_norm": 1.005386476597215, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 32223 + }, + { + "epoch": 0.32224, + "grad_norm": 1.2644133675662077, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 32224 + }, + { + "epoch": 0.32225, + "grad_norm": 0.7334006188486157, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 32225 + }, + { + "epoch": 0.32226, + "grad_norm": 0.6514463860960846, + "learning_rate": 0.003, + "loss": 4.0083, + "step": 32226 + }, + { + "epoch": 0.32227, + "grad_norm": 0.7512507739703294, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 32227 + }, + { + "epoch": 0.32228, + "grad_norm": 0.7550455186600876, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 32228 + }, + { + "epoch": 0.32229, + "grad_norm": 0.7218245832051776, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 32229 + }, + { + "epoch": 0.3223, + "grad_norm": 0.7698463998195858, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 32230 + }, + { + "epoch": 0.32231, + "grad_norm": 0.8168956262351593, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 32231 + }, + { + "epoch": 0.32232, + "grad_norm": 0.7339963852839679, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 32232 + }, + { + "epoch": 0.32233, + "grad_norm": 0.7360059569554366, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 32233 + }, + { + "epoch": 0.32234, + "grad_norm": 0.746988054502412, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 32234 + }, + { + "epoch": 0.32235, + "grad_norm": 0.8784104458965614, + "learning_rate": 0.003, + "loss": 3.9938, + "step": 32235 + }, + { + "epoch": 0.32236, + "grad_norm": 0.9218926344024032, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 32236 + }, + { + "epoch": 0.32237, + "grad_norm": 1.0026539694055925, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 32237 + }, + { + "epoch": 0.32238, + "grad_norm": 1.2020239460106188, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 32238 + }, + { + "epoch": 0.32239, + "grad_norm": 0.775546711058531, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 32239 + }, + { + "epoch": 0.3224, + "grad_norm": 0.6462218624913312, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 32240 + }, + { + "epoch": 0.32241, + "grad_norm": 0.7854321609735561, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 32241 + }, + { + "epoch": 0.32242, + "grad_norm": 0.8734837347510679, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 32242 + }, + { + "epoch": 0.32243, + "grad_norm": 0.7635065962445767, + "learning_rate": 0.003, + "loss": 4.057, + "step": 32243 + }, + { + "epoch": 0.32244, + "grad_norm": 0.8199533440039678, + "learning_rate": 0.003, + "loss": 4.068, + "step": 32244 + }, + { + "epoch": 0.32245, + "grad_norm": 0.8425384495622505, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 32245 + }, + { + "epoch": 0.32246, + "grad_norm": 0.9037748150721758, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 32246 + }, + { + "epoch": 0.32247, + "grad_norm": 1.0834000111593438, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 32247 + }, + { + "epoch": 0.32248, + "grad_norm": 1.1907425594444014, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 32248 + }, + { + "epoch": 0.32249, + "grad_norm": 0.9383884373978388, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 32249 + }, + { + "epoch": 0.3225, + "grad_norm": 0.9833057176108768, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 32250 + }, + { + "epoch": 0.32251, + "grad_norm": 0.9738031767988453, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 32251 + }, + { + "epoch": 0.32252, + "grad_norm": 0.9636566641228305, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 32252 + }, + { + "epoch": 0.32253, + "grad_norm": 1.0201503942677856, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 32253 + }, + { + "epoch": 0.32254, + "grad_norm": 0.9424680182651337, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 32254 + }, + { + "epoch": 0.32255, + "grad_norm": 0.9547605731866455, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 32255 + }, + { + "epoch": 0.32256, + "grad_norm": 0.859976616674784, + "learning_rate": 0.003, + "loss": 4.076, + "step": 32256 + }, + { + "epoch": 0.32257, + "grad_norm": 1.0016660211474069, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 32257 + }, + { + "epoch": 0.32258, + "grad_norm": 1.3184395917959293, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 32258 + }, + { + "epoch": 0.32259, + "grad_norm": 0.6678786243340351, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 32259 + }, + { + "epoch": 0.3226, + "grad_norm": 0.7463943905771643, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 32260 + }, + { + "epoch": 0.32261, + "grad_norm": 0.8661045049533692, + "learning_rate": 0.003, + "loss": 4.061, + "step": 32261 + }, + { + "epoch": 0.32262, + "grad_norm": 0.9178604289463683, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 32262 + }, + { + "epoch": 0.32263, + "grad_norm": 0.8620594528146983, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 32263 + }, + { + "epoch": 0.32264, + "grad_norm": 0.7628013365617935, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 32264 + }, + { + "epoch": 0.32265, + "grad_norm": 0.756130514301219, + "learning_rate": 0.003, + "loss": 4.04, + "step": 32265 + }, + { + "epoch": 0.32266, + "grad_norm": 0.834626731311854, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 32266 + }, + { + "epoch": 0.32267, + "grad_norm": 0.8306501767475094, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 32267 + }, + { + "epoch": 0.32268, + "grad_norm": 0.7783538083204286, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 32268 + }, + { + "epoch": 0.32269, + "grad_norm": 0.6832771857852484, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 32269 + }, + { + "epoch": 0.3227, + "grad_norm": 0.6279022135986003, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 32270 + }, + { + "epoch": 0.32271, + "grad_norm": 0.7990760164346076, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 32271 + }, + { + "epoch": 0.32272, + "grad_norm": 1.1266145440985458, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 32272 + }, + { + "epoch": 0.32273, + "grad_norm": 1.2711223108728127, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 32273 + }, + { + "epoch": 0.32274, + "grad_norm": 0.7119963235200429, + "learning_rate": 0.003, + "loss": 4.013, + "step": 32274 + }, + { + "epoch": 0.32275, + "grad_norm": 0.7682000239029506, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 32275 + }, + { + "epoch": 0.32276, + "grad_norm": 0.71769357951076, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 32276 + }, + { + "epoch": 0.32277, + "grad_norm": 0.6932969427499185, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 32277 + }, + { + "epoch": 0.32278, + "grad_norm": 0.6570708939940232, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 32278 + }, + { + "epoch": 0.32279, + "grad_norm": 0.5824043989436387, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 32279 + }, + { + "epoch": 0.3228, + "grad_norm": 0.5444073045830781, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 32280 + }, + { + "epoch": 0.32281, + "grad_norm": 0.6581936598450012, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 32281 + }, + { + "epoch": 0.32282, + "grad_norm": 0.9393683566391791, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 32282 + }, + { + "epoch": 0.32283, + "grad_norm": 1.138155339324421, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 32283 + }, + { + "epoch": 0.32284, + "grad_norm": 0.8428264184836244, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 32284 + }, + { + "epoch": 0.32285, + "grad_norm": 0.8812750638218401, + "learning_rate": 0.003, + "loss": 4.032, + "step": 32285 + }, + { + "epoch": 0.32286, + "grad_norm": 0.9279909368478223, + "learning_rate": 0.003, + "loss": 4.044, + "step": 32286 + }, + { + "epoch": 0.32287, + "grad_norm": 1.0236116765055276, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 32287 + }, + { + "epoch": 0.32288, + "grad_norm": 0.9126246407422599, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 32288 + }, + { + "epoch": 0.32289, + "grad_norm": 0.8336149842992552, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 32289 + }, + { + "epoch": 0.3229, + "grad_norm": 0.8977170598224049, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 32290 + }, + { + "epoch": 0.32291, + "grad_norm": 0.9226282466663561, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 32291 + }, + { + "epoch": 0.32292, + "grad_norm": 0.8691613370896669, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 32292 + }, + { + "epoch": 0.32293, + "grad_norm": 0.7741567960866625, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 32293 + }, + { + "epoch": 0.32294, + "grad_norm": 0.8100276953842075, + "learning_rate": 0.003, + "loss": 4.062, + "step": 32294 + }, + { + "epoch": 0.32295, + "grad_norm": 0.7670113540564968, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 32295 + }, + { + "epoch": 0.32296, + "grad_norm": 0.7065423049337619, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 32296 + }, + { + "epoch": 0.32297, + "grad_norm": 0.7762058151688797, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 32297 + }, + { + "epoch": 0.32298, + "grad_norm": 0.7300176619704652, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 32298 + }, + { + "epoch": 0.32299, + "grad_norm": 0.6708670946918538, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 32299 + }, + { + "epoch": 0.323, + "grad_norm": 0.700138469446253, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 32300 + }, + { + "epoch": 0.32301, + "grad_norm": 0.8009185701203718, + "learning_rate": 0.003, + "loss": 4.014, + "step": 32301 + }, + { + "epoch": 0.32302, + "grad_norm": 0.9167397325643818, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 32302 + }, + { + "epoch": 0.32303, + "grad_norm": 1.101083733716767, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 32303 + }, + { + "epoch": 0.32304, + "grad_norm": 1.0676385010938045, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 32304 + }, + { + "epoch": 0.32305, + "grad_norm": 0.8468035259911453, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 32305 + }, + { + "epoch": 0.32306, + "grad_norm": 0.7593810368585876, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 32306 + }, + { + "epoch": 0.32307, + "grad_norm": 0.7474168188498049, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 32307 + }, + { + "epoch": 0.32308, + "grad_norm": 0.8604325683152758, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 32308 + }, + { + "epoch": 0.32309, + "grad_norm": 0.9743420119431495, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 32309 + }, + { + "epoch": 0.3231, + "grad_norm": 1.0430454088754981, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 32310 + }, + { + "epoch": 0.32311, + "grad_norm": 1.1703433789662439, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 32311 + }, + { + "epoch": 0.32312, + "grad_norm": 0.8182212921381029, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 32312 + }, + { + "epoch": 0.32313, + "grad_norm": 0.6888783538732546, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 32313 + }, + { + "epoch": 0.32314, + "grad_norm": 0.8264579931993787, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 32314 + }, + { + "epoch": 0.32315, + "grad_norm": 0.8145009516368978, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 32315 + }, + { + "epoch": 0.32316, + "grad_norm": 0.9069230108730862, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 32316 + }, + { + "epoch": 0.32317, + "grad_norm": 1.0212741283976756, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 32317 + }, + { + "epoch": 0.32318, + "grad_norm": 0.8522968841919638, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 32318 + }, + { + "epoch": 0.32319, + "grad_norm": 0.8855478760880819, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 32319 + }, + { + "epoch": 0.3232, + "grad_norm": 0.8817897215981032, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 32320 + }, + { + "epoch": 0.32321, + "grad_norm": 0.7216924611504967, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 32321 + }, + { + "epoch": 0.32322, + "grad_norm": 0.6449237058642023, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 32322 + }, + { + "epoch": 0.32323, + "grad_norm": 0.7448239582745684, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 32323 + }, + { + "epoch": 0.32324, + "grad_norm": 0.8856571268079174, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 32324 + }, + { + "epoch": 0.32325, + "grad_norm": 0.9705643939177859, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 32325 + }, + { + "epoch": 0.32326, + "grad_norm": 1.0406156815874361, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 32326 + }, + { + "epoch": 0.32327, + "grad_norm": 0.8922499633547784, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 32327 + }, + { + "epoch": 0.32328, + "grad_norm": 0.8257915581360175, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 32328 + }, + { + "epoch": 0.32329, + "grad_norm": 0.8189930751473851, + "learning_rate": 0.003, + "loss": 4.035, + "step": 32329 + }, + { + "epoch": 0.3233, + "grad_norm": 1.1155201794963976, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 32330 + }, + { + "epoch": 0.32331, + "grad_norm": 1.1746602607208694, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 32331 + }, + { + "epoch": 0.32332, + "grad_norm": 0.9523337206799993, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 32332 + }, + { + "epoch": 0.32333, + "grad_norm": 0.962626129183462, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 32333 + }, + { + "epoch": 0.32334, + "grad_norm": 0.9314461493164828, + "learning_rate": 0.003, + "loss": 4.029, + "step": 32334 + }, + { + "epoch": 0.32335, + "grad_norm": 0.7536293531586414, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 32335 + }, + { + "epoch": 0.32336, + "grad_norm": 0.6867427069768643, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 32336 + }, + { + "epoch": 0.32337, + "grad_norm": 0.6561471057554228, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 32337 + }, + { + "epoch": 0.32338, + "grad_norm": 0.6679139771988902, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 32338 + }, + { + "epoch": 0.32339, + "grad_norm": 0.6970902407564015, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 32339 + }, + { + "epoch": 0.3234, + "grad_norm": 0.6461613969036247, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 32340 + }, + { + "epoch": 0.32341, + "grad_norm": 0.6775312452786895, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 32341 + }, + { + "epoch": 0.32342, + "grad_norm": 0.7712636293850984, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 32342 + }, + { + "epoch": 0.32343, + "grad_norm": 0.8373951220628817, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 32343 + }, + { + "epoch": 0.32344, + "grad_norm": 0.8704866617007142, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 32344 + }, + { + "epoch": 0.32345, + "grad_norm": 0.7823367022192289, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 32345 + }, + { + "epoch": 0.32346, + "grad_norm": 0.6420866673437643, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 32346 + }, + { + "epoch": 0.32347, + "grad_norm": 0.6193222894922366, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 32347 + }, + { + "epoch": 0.32348, + "grad_norm": 0.5890217712363472, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 32348 + }, + { + "epoch": 0.32349, + "grad_norm": 0.5653786811237625, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 32349 + }, + { + "epoch": 0.3235, + "grad_norm": 0.5455095076404681, + "learning_rate": 0.003, + "loss": 4.046, + "step": 32350 + }, + { + "epoch": 0.32351, + "grad_norm": 0.5684783099105587, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 32351 + }, + { + "epoch": 0.32352, + "grad_norm": 0.5679456273729846, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 32352 + }, + { + "epoch": 0.32353, + "grad_norm": 0.6745113843365564, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 32353 + }, + { + "epoch": 0.32354, + "grad_norm": 0.6761177252948698, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 32354 + }, + { + "epoch": 0.32355, + "grad_norm": 0.7791159911491643, + "learning_rate": 0.003, + "loss": 4.04, + "step": 32355 + }, + { + "epoch": 0.32356, + "grad_norm": 1.0139570939564477, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 32356 + }, + { + "epoch": 0.32357, + "grad_norm": 1.293244369561617, + "learning_rate": 0.003, + "loss": 4.079, + "step": 32357 + }, + { + "epoch": 0.32358, + "grad_norm": 0.6584405051639541, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 32358 + }, + { + "epoch": 0.32359, + "grad_norm": 0.6727183254827315, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 32359 + }, + { + "epoch": 0.3236, + "grad_norm": 0.7664880022610624, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 32360 + }, + { + "epoch": 0.32361, + "grad_norm": 0.8120060101276529, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 32361 + }, + { + "epoch": 0.32362, + "grad_norm": 0.8451679524243048, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 32362 + }, + { + "epoch": 0.32363, + "grad_norm": 0.7601203280413358, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 32363 + }, + { + "epoch": 0.32364, + "grad_norm": 0.7520953416943923, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 32364 + }, + { + "epoch": 0.32365, + "grad_norm": 0.8464035136546044, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 32365 + }, + { + "epoch": 0.32366, + "grad_norm": 0.8860531317365918, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 32366 + }, + { + "epoch": 0.32367, + "grad_norm": 0.9056158035887893, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 32367 + }, + { + "epoch": 0.32368, + "grad_norm": 1.002249721020873, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 32368 + }, + { + "epoch": 0.32369, + "grad_norm": 1.118616940436004, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 32369 + }, + { + "epoch": 0.3237, + "grad_norm": 1.0579637306002452, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 32370 + }, + { + "epoch": 0.32371, + "grad_norm": 1.0490792022585964, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 32371 + }, + { + "epoch": 0.32372, + "grad_norm": 1.1171385781613343, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 32372 + }, + { + "epoch": 0.32373, + "grad_norm": 1.0230382376870235, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 32373 + }, + { + "epoch": 0.32374, + "grad_norm": 1.1772239733294187, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 32374 + }, + { + "epoch": 0.32375, + "grad_norm": 0.9701301214444298, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 32375 + }, + { + "epoch": 0.32376, + "grad_norm": 0.9407467209597434, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 32376 + }, + { + "epoch": 0.32377, + "grad_norm": 0.8615311887723703, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 32377 + }, + { + "epoch": 0.32378, + "grad_norm": 0.7787250639625689, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 32378 + }, + { + "epoch": 0.32379, + "grad_norm": 0.7593865515816075, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 32379 + }, + { + "epoch": 0.3238, + "grad_norm": 0.7605450701845645, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 32380 + }, + { + "epoch": 0.32381, + "grad_norm": 0.7219943621367391, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 32381 + }, + { + "epoch": 0.32382, + "grad_norm": 0.6559584563119266, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 32382 + }, + { + "epoch": 0.32383, + "grad_norm": 0.6982615909167791, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 32383 + }, + { + "epoch": 0.32384, + "grad_norm": 0.8372686169003345, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 32384 + }, + { + "epoch": 0.32385, + "grad_norm": 0.9943234170786358, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 32385 + }, + { + "epoch": 0.32386, + "grad_norm": 0.9797029653688999, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 32386 + }, + { + "epoch": 0.32387, + "grad_norm": 0.8894974745721331, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 32387 + }, + { + "epoch": 0.32388, + "grad_norm": 0.9177078182331543, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 32388 + }, + { + "epoch": 0.32389, + "grad_norm": 0.924833662187325, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 32389 + }, + { + "epoch": 0.3239, + "grad_norm": 0.9495545063290977, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 32390 + }, + { + "epoch": 0.32391, + "grad_norm": 0.9719076719725539, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 32391 + }, + { + "epoch": 0.32392, + "grad_norm": 1.0310010019892792, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 32392 + }, + { + "epoch": 0.32393, + "grad_norm": 1.0116185247720566, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 32393 + }, + { + "epoch": 0.32394, + "grad_norm": 0.9235086129935368, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 32394 + }, + { + "epoch": 0.32395, + "grad_norm": 0.8386486816405722, + "learning_rate": 0.003, + "loss": 4.058, + "step": 32395 + }, + { + "epoch": 0.32396, + "grad_norm": 0.7823133825296625, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 32396 + }, + { + "epoch": 0.32397, + "grad_norm": 0.8079520393848514, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 32397 + }, + { + "epoch": 0.32398, + "grad_norm": 0.8157863464018793, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 32398 + }, + { + "epoch": 0.32399, + "grad_norm": 0.8350030055320209, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 32399 + }, + { + "epoch": 0.324, + "grad_norm": 0.9797250201811051, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 32400 + }, + { + "epoch": 0.32401, + "grad_norm": 1.1273662009011625, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 32401 + }, + { + "epoch": 0.32402, + "grad_norm": 0.8096029942371695, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 32402 + }, + { + "epoch": 0.32403, + "grad_norm": 0.6668564405196714, + "learning_rate": 0.003, + "loss": 4.049, + "step": 32403 + }, + { + "epoch": 0.32404, + "grad_norm": 0.6721271658434185, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 32404 + }, + { + "epoch": 0.32405, + "grad_norm": 0.6740651942228963, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 32405 + }, + { + "epoch": 0.32406, + "grad_norm": 0.7371625302860473, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 32406 + }, + { + "epoch": 0.32407, + "grad_norm": 0.8571869973070066, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 32407 + }, + { + "epoch": 0.32408, + "grad_norm": 1.0757907004249518, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 32408 + }, + { + "epoch": 0.32409, + "grad_norm": 0.8630219760202921, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 32409 + }, + { + "epoch": 0.3241, + "grad_norm": 0.6058049018252737, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 32410 + }, + { + "epoch": 0.32411, + "grad_norm": 0.5818891282596081, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 32411 + }, + { + "epoch": 0.32412, + "grad_norm": 0.6050632525656291, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 32412 + }, + { + "epoch": 0.32413, + "grad_norm": 0.6526553256802238, + "learning_rate": 0.003, + "loss": 4.059, + "step": 32413 + }, + { + "epoch": 0.32414, + "grad_norm": 0.651872876229171, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 32414 + }, + { + "epoch": 0.32415, + "grad_norm": 0.6540530871445872, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 32415 + }, + { + "epoch": 0.32416, + "grad_norm": 0.6574382732666066, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 32416 + }, + { + "epoch": 0.32417, + "grad_norm": 0.6097716019132938, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 32417 + }, + { + "epoch": 0.32418, + "grad_norm": 0.5955253668299725, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 32418 + }, + { + "epoch": 0.32419, + "grad_norm": 0.6560988848031566, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 32419 + }, + { + "epoch": 0.3242, + "grad_norm": 0.7748700116349909, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 32420 + }, + { + "epoch": 0.32421, + "grad_norm": 0.8842548857297221, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 32421 + }, + { + "epoch": 0.32422, + "grad_norm": 1.008816486167834, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 32422 + }, + { + "epoch": 0.32423, + "grad_norm": 1.2043388149518008, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 32423 + }, + { + "epoch": 0.32424, + "grad_norm": 1.131819484550399, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 32424 + }, + { + "epoch": 0.32425, + "grad_norm": 0.7525013442716076, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 32425 + }, + { + "epoch": 0.32426, + "grad_norm": 0.8417635683290521, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 32426 + }, + { + "epoch": 0.32427, + "grad_norm": 0.8872114675093807, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 32427 + }, + { + "epoch": 0.32428, + "grad_norm": 0.8177402936458994, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 32428 + }, + { + "epoch": 0.32429, + "grad_norm": 0.8061473810521187, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 32429 + }, + { + "epoch": 0.3243, + "grad_norm": 0.7979180966816082, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 32430 + }, + { + "epoch": 0.32431, + "grad_norm": 0.9406104183134433, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 32431 + }, + { + "epoch": 0.32432, + "grad_norm": 1.1706785518644378, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 32432 + }, + { + "epoch": 0.32433, + "grad_norm": 0.9204040336502325, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 32433 + }, + { + "epoch": 0.32434, + "grad_norm": 0.8480485495319959, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 32434 + }, + { + "epoch": 0.32435, + "grad_norm": 0.78424934053959, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 32435 + }, + { + "epoch": 0.32436, + "grad_norm": 0.6617567979613356, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 32436 + }, + { + "epoch": 0.32437, + "grad_norm": 0.6341200918053179, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 32437 + }, + { + "epoch": 0.32438, + "grad_norm": 0.560160292452773, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 32438 + }, + { + "epoch": 0.32439, + "grad_norm": 0.6611119797878421, + "learning_rate": 0.003, + "loss": 4.016, + "step": 32439 + }, + { + "epoch": 0.3244, + "grad_norm": 0.8011911855796873, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 32440 + }, + { + "epoch": 0.32441, + "grad_norm": 0.9568399056004658, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 32441 + }, + { + "epoch": 0.32442, + "grad_norm": 1.081974197471788, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 32442 + }, + { + "epoch": 0.32443, + "grad_norm": 0.9474284962697, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 32443 + }, + { + "epoch": 0.32444, + "grad_norm": 0.946480447449302, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 32444 + }, + { + "epoch": 0.32445, + "grad_norm": 0.8314780218085125, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 32445 + }, + { + "epoch": 0.32446, + "grad_norm": 0.7354593369045038, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 32446 + }, + { + "epoch": 0.32447, + "grad_norm": 0.7420538227228154, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 32447 + }, + { + "epoch": 0.32448, + "grad_norm": 0.7314345987547949, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 32448 + }, + { + "epoch": 0.32449, + "grad_norm": 0.8517977352026128, + "learning_rate": 0.003, + "loss": 4.042, + "step": 32449 + }, + { + "epoch": 0.3245, + "grad_norm": 0.8777509225249325, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 32450 + }, + { + "epoch": 0.32451, + "grad_norm": 0.8615559113619968, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 32451 + }, + { + "epoch": 0.32452, + "grad_norm": 0.990937090091958, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 32452 + }, + { + "epoch": 0.32453, + "grad_norm": 1.1731555510835727, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 32453 + }, + { + "epoch": 0.32454, + "grad_norm": 0.797749326571795, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 32454 + }, + { + "epoch": 0.32455, + "grad_norm": 0.8559594645003353, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 32455 + }, + { + "epoch": 0.32456, + "grad_norm": 0.9064903507231489, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 32456 + }, + { + "epoch": 0.32457, + "grad_norm": 0.816951819575074, + "learning_rate": 0.003, + "loss": 4.066, + "step": 32457 + }, + { + "epoch": 0.32458, + "grad_norm": 0.8267650975129449, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 32458 + }, + { + "epoch": 0.32459, + "grad_norm": 0.8221406596182723, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 32459 + }, + { + "epoch": 0.3246, + "grad_norm": 0.96660614998581, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 32460 + }, + { + "epoch": 0.32461, + "grad_norm": 1.0104880855751777, + "learning_rate": 0.003, + "loss": 4.068, + "step": 32461 + }, + { + "epoch": 0.32462, + "grad_norm": 0.9785938381122375, + "learning_rate": 0.003, + "loss": 4.023, + "step": 32462 + }, + { + "epoch": 0.32463, + "grad_norm": 0.9911124277645665, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 32463 + }, + { + "epoch": 0.32464, + "grad_norm": 0.8342693971009238, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 32464 + }, + { + "epoch": 0.32465, + "grad_norm": 0.7638744435924395, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 32465 + }, + { + "epoch": 0.32466, + "grad_norm": 0.6585887336018129, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 32466 + }, + { + "epoch": 0.32467, + "grad_norm": 0.6786482903148481, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 32467 + }, + { + "epoch": 0.32468, + "grad_norm": 0.7462272688086826, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 32468 + }, + { + "epoch": 0.32469, + "grad_norm": 0.8402071689120815, + "learning_rate": 0.003, + "loss": 4.037, + "step": 32469 + }, + { + "epoch": 0.3247, + "grad_norm": 0.9419210745109278, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 32470 + }, + { + "epoch": 0.32471, + "grad_norm": 1.1349618248796147, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 32471 + }, + { + "epoch": 0.32472, + "grad_norm": 0.7739083411568652, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 32472 + }, + { + "epoch": 0.32473, + "grad_norm": 0.7534394848629673, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 32473 + }, + { + "epoch": 0.32474, + "grad_norm": 0.7538361034758573, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 32474 + }, + { + "epoch": 0.32475, + "grad_norm": 0.8317476640101414, + "learning_rate": 0.003, + "loss": 4.036, + "step": 32475 + }, + { + "epoch": 0.32476, + "grad_norm": 0.962108479555826, + "learning_rate": 0.003, + "loss": 4.036, + "step": 32476 + }, + { + "epoch": 0.32477, + "grad_norm": 1.0076634509523166, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 32477 + }, + { + "epoch": 0.32478, + "grad_norm": 0.9524713160013971, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 32478 + }, + { + "epoch": 0.32479, + "grad_norm": 0.8938946145068402, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 32479 + }, + { + "epoch": 0.3248, + "grad_norm": 0.866598713696185, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 32480 + }, + { + "epoch": 0.32481, + "grad_norm": 0.9518292706137309, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 32481 + }, + { + "epoch": 0.32482, + "grad_norm": 1.0706677475728912, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 32482 + }, + { + "epoch": 0.32483, + "grad_norm": 0.9377819750666391, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 32483 + }, + { + "epoch": 0.32484, + "grad_norm": 0.8795501248471351, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 32484 + }, + { + "epoch": 0.32485, + "grad_norm": 0.8313086996567106, + "learning_rate": 0.003, + "loss": 4.06, + "step": 32485 + }, + { + "epoch": 0.32486, + "grad_norm": 0.8114500049966729, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 32486 + }, + { + "epoch": 0.32487, + "grad_norm": 0.7933047784511851, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 32487 + }, + { + "epoch": 0.32488, + "grad_norm": 0.8284654799040798, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 32488 + }, + { + "epoch": 0.32489, + "grad_norm": 0.8210404517957974, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 32489 + }, + { + "epoch": 0.3249, + "grad_norm": 0.8459774536816805, + "learning_rate": 0.003, + "loss": 4.054, + "step": 32490 + }, + { + "epoch": 0.32491, + "grad_norm": 0.9110390501635937, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 32491 + }, + { + "epoch": 0.32492, + "grad_norm": 0.9012254038577967, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 32492 + }, + { + "epoch": 0.32493, + "grad_norm": 0.9101919667441966, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 32493 + }, + { + "epoch": 0.32494, + "grad_norm": 0.993720091782351, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 32494 + }, + { + "epoch": 0.32495, + "grad_norm": 0.9721134862123603, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 32495 + }, + { + "epoch": 0.32496, + "grad_norm": 0.9936382312282888, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 32496 + }, + { + "epoch": 0.32497, + "grad_norm": 0.9546097052850503, + "learning_rate": 0.003, + "loss": 4.051, + "step": 32497 + }, + { + "epoch": 0.32498, + "grad_norm": 0.8523461580101721, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 32498 + }, + { + "epoch": 0.32499, + "grad_norm": 0.9402993849427704, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 32499 + }, + { + "epoch": 0.325, + "grad_norm": 0.8599630797629967, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 32500 + }, + { + "epoch": 0.32501, + "grad_norm": 0.8505517313609147, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 32501 + }, + { + "epoch": 0.32502, + "grad_norm": 0.8695682316241959, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 32502 + }, + { + "epoch": 0.32503, + "grad_norm": 0.9859328747547556, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 32503 + }, + { + "epoch": 0.32504, + "grad_norm": 0.9527466740339436, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 32504 + }, + { + "epoch": 0.32505, + "grad_norm": 0.8359264286724243, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 32505 + }, + { + "epoch": 0.32506, + "grad_norm": 0.8839760459143862, + "learning_rate": 0.003, + "loss": 4.061, + "step": 32506 + }, + { + "epoch": 0.32507, + "grad_norm": 0.8822061749609003, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 32507 + }, + { + "epoch": 0.32508, + "grad_norm": 0.8653949652705336, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 32508 + }, + { + "epoch": 0.32509, + "grad_norm": 0.9037035648499723, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 32509 + }, + { + "epoch": 0.3251, + "grad_norm": 0.9159744382892694, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 32510 + }, + { + "epoch": 0.32511, + "grad_norm": 0.977121533159995, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 32511 + }, + { + "epoch": 0.32512, + "grad_norm": 1.0465938638963157, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 32512 + }, + { + "epoch": 0.32513, + "grad_norm": 1.0311322826718887, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 32513 + }, + { + "epoch": 0.32514, + "grad_norm": 0.8029986298815406, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 32514 + }, + { + "epoch": 0.32515, + "grad_norm": 0.7241715100164978, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 32515 + }, + { + "epoch": 0.32516, + "grad_norm": 0.7662160623473624, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 32516 + }, + { + "epoch": 0.32517, + "grad_norm": 0.7749811847695717, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 32517 + }, + { + "epoch": 0.32518, + "grad_norm": 0.7929216707209552, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 32518 + }, + { + "epoch": 0.32519, + "grad_norm": 0.6921712562372632, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 32519 + }, + { + "epoch": 0.3252, + "grad_norm": 0.6202446185540131, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 32520 + }, + { + "epoch": 0.32521, + "grad_norm": 0.6068708335721467, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 32521 + }, + { + "epoch": 0.32522, + "grad_norm": 0.5791522484779299, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 32522 + }, + { + "epoch": 0.32523, + "grad_norm": 0.6359440606138411, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 32523 + }, + { + "epoch": 0.32524, + "grad_norm": 0.6300473151207283, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 32524 + }, + { + "epoch": 0.32525, + "grad_norm": 0.5875943235496611, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 32525 + }, + { + "epoch": 0.32526, + "grad_norm": 0.6460274321883364, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 32526 + }, + { + "epoch": 0.32527, + "grad_norm": 0.7176503979657928, + "learning_rate": 0.003, + "loss": 4.0018, + "step": 32527 + }, + { + "epoch": 0.32528, + "grad_norm": 0.7569674709344677, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 32528 + }, + { + "epoch": 0.32529, + "grad_norm": 0.7634946695355567, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 32529 + }, + { + "epoch": 0.3253, + "grad_norm": 0.8185039735871804, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 32530 + }, + { + "epoch": 0.32531, + "grad_norm": 0.8879626382820008, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 32531 + }, + { + "epoch": 0.32532, + "grad_norm": 0.9563229441586824, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 32532 + }, + { + "epoch": 0.32533, + "grad_norm": 1.0005716778938123, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 32533 + }, + { + "epoch": 0.32534, + "grad_norm": 1.1631716319156706, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 32534 + }, + { + "epoch": 0.32535, + "grad_norm": 0.9404326691036433, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 32535 + }, + { + "epoch": 0.32536, + "grad_norm": 0.9024408873572741, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 32536 + }, + { + "epoch": 0.32537, + "grad_norm": 0.932940203114261, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 32537 + }, + { + "epoch": 0.32538, + "grad_norm": 1.0129567466035616, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 32538 + }, + { + "epoch": 0.32539, + "grad_norm": 0.8735338061609483, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 32539 + }, + { + "epoch": 0.3254, + "grad_norm": 0.7569360225968319, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 32540 + }, + { + "epoch": 0.32541, + "grad_norm": 0.7531630462745532, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 32541 + }, + { + "epoch": 0.32542, + "grad_norm": 0.6753138375391916, + "learning_rate": 0.003, + "loss": 4.066, + "step": 32542 + }, + { + "epoch": 0.32543, + "grad_norm": 0.6912591709437553, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 32543 + }, + { + "epoch": 0.32544, + "grad_norm": 0.6378119202634904, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 32544 + }, + { + "epoch": 0.32545, + "grad_norm": 0.5736197603656417, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 32545 + }, + { + "epoch": 0.32546, + "grad_norm": 0.567805166502217, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 32546 + }, + { + "epoch": 0.32547, + "grad_norm": 0.570924134773479, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 32547 + }, + { + "epoch": 0.32548, + "grad_norm": 0.6161073908471861, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 32548 + }, + { + "epoch": 0.32549, + "grad_norm": 0.7255282757326842, + "learning_rate": 0.003, + "loss": 4.0044, + "step": 32549 + }, + { + "epoch": 0.3255, + "grad_norm": 1.0696041458316223, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 32550 + }, + { + "epoch": 0.32551, + "grad_norm": 1.1798883898243353, + "learning_rate": 0.003, + "loss": 4.021, + "step": 32551 + }, + { + "epoch": 0.32552, + "grad_norm": 0.8727814034711785, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 32552 + }, + { + "epoch": 0.32553, + "grad_norm": 0.8226178698220777, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 32553 + }, + { + "epoch": 0.32554, + "grad_norm": 0.7775382204150688, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 32554 + }, + { + "epoch": 0.32555, + "grad_norm": 0.8981569102385294, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 32555 + }, + { + "epoch": 0.32556, + "grad_norm": 0.8666164341413275, + "learning_rate": 0.003, + "loss": 4.051, + "step": 32556 + }, + { + "epoch": 0.32557, + "grad_norm": 0.8765939746411787, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 32557 + }, + { + "epoch": 0.32558, + "grad_norm": 0.8992140367605779, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 32558 + }, + { + "epoch": 0.32559, + "grad_norm": 0.9739244730448949, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 32559 + }, + { + "epoch": 0.3256, + "grad_norm": 1.0792799810872777, + "learning_rate": 0.003, + "loss": 4.069, + "step": 32560 + }, + { + "epoch": 0.32561, + "grad_norm": 1.1153840983791787, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 32561 + }, + { + "epoch": 0.32562, + "grad_norm": 0.8527095903119498, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 32562 + }, + { + "epoch": 0.32563, + "grad_norm": 0.7073353847577559, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 32563 + }, + { + "epoch": 0.32564, + "grad_norm": 0.7018040334115425, + "learning_rate": 0.003, + "loss": 4.035, + "step": 32564 + }, + { + "epoch": 0.32565, + "grad_norm": 0.6825879397660947, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 32565 + }, + { + "epoch": 0.32566, + "grad_norm": 0.7902643142545418, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 32566 + }, + { + "epoch": 0.32567, + "grad_norm": 0.9439713931740356, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 32567 + }, + { + "epoch": 0.32568, + "grad_norm": 1.0969949600602686, + "learning_rate": 0.003, + "loss": 4.059, + "step": 32568 + }, + { + "epoch": 0.32569, + "grad_norm": 1.0292360273393177, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 32569 + }, + { + "epoch": 0.3257, + "grad_norm": 0.8395614529442065, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 32570 + }, + { + "epoch": 0.32571, + "grad_norm": 0.7642680841081277, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 32571 + }, + { + "epoch": 0.32572, + "grad_norm": 0.7343925241461134, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 32572 + }, + { + "epoch": 0.32573, + "grad_norm": 0.8178668749519237, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 32573 + }, + { + "epoch": 0.32574, + "grad_norm": 0.8871834598925148, + "learning_rate": 0.003, + "loss": 4.04, + "step": 32574 + }, + { + "epoch": 0.32575, + "grad_norm": 0.8157414758277594, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 32575 + }, + { + "epoch": 0.32576, + "grad_norm": 0.7082762149351927, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 32576 + }, + { + "epoch": 0.32577, + "grad_norm": 0.7311944200233875, + "learning_rate": 0.003, + "loss": 4.0074, + "step": 32577 + }, + { + "epoch": 0.32578, + "grad_norm": 0.7471532571374218, + "learning_rate": 0.003, + "loss": 4.029, + "step": 32578 + }, + { + "epoch": 0.32579, + "grad_norm": 0.9053340254333775, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 32579 + }, + { + "epoch": 0.3258, + "grad_norm": 1.0563485693937398, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 32580 + }, + { + "epoch": 0.32581, + "grad_norm": 1.0730855603732499, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 32581 + }, + { + "epoch": 0.32582, + "grad_norm": 0.8222737850350875, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 32582 + }, + { + "epoch": 0.32583, + "grad_norm": 0.9230544805793801, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 32583 + }, + { + "epoch": 0.32584, + "grad_norm": 1.0902773122066156, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 32584 + }, + { + "epoch": 0.32585, + "grad_norm": 1.001979374489625, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 32585 + }, + { + "epoch": 0.32586, + "grad_norm": 0.9612509735145369, + "learning_rate": 0.003, + "loss": 4.048, + "step": 32586 + }, + { + "epoch": 0.32587, + "grad_norm": 0.9155126535035115, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 32587 + }, + { + "epoch": 0.32588, + "grad_norm": 0.9090094495265286, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 32588 + }, + { + "epoch": 0.32589, + "grad_norm": 0.8870907521648331, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 32589 + }, + { + "epoch": 0.3259, + "grad_norm": 0.7950450459390722, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 32590 + }, + { + "epoch": 0.32591, + "grad_norm": 0.7385842791518953, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 32591 + }, + { + "epoch": 0.32592, + "grad_norm": 0.6941728578943309, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 32592 + }, + { + "epoch": 0.32593, + "grad_norm": 0.6918214189760913, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 32593 + }, + { + "epoch": 0.32594, + "grad_norm": 0.770830115065779, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 32594 + }, + { + "epoch": 0.32595, + "grad_norm": 0.8530177736738423, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 32595 + }, + { + "epoch": 0.32596, + "grad_norm": 1.02183969715601, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 32596 + }, + { + "epoch": 0.32597, + "grad_norm": 0.8430391367372441, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 32597 + }, + { + "epoch": 0.32598, + "grad_norm": 0.7469850574350272, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 32598 + }, + { + "epoch": 0.32599, + "grad_norm": 0.8301538375710558, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 32599 + }, + { + "epoch": 0.326, + "grad_norm": 0.8997271591832265, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 32600 + }, + { + "epoch": 0.32601, + "grad_norm": 0.9684148711140967, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 32601 + }, + { + "epoch": 0.32602, + "grad_norm": 1.2299429624540272, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 32602 + }, + { + "epoch": 0.32603, + "grad_norm": 1.0057955640830745, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 32603 + }, + { + "epoch": 0.32604, + "grad_norm": 0.900385274695718, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 32604 + }, + { + "epoch": 0.32605, + "grad_norm": 0.8151598244475973, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 32605 + }, + { + "epoch": 0.32606, + "grad_norm": 0.783905914434776, + "learning_rate": 0.003, + "loss": 4.048, + "step": 32606 + }, + { + "epoch": 0.32607, + "grad_norm": 0.844341922226867, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 32607 + }, + { + "epoch": 0.32608, + "grad_norm": 0.8542859851316374, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 32608 + }, + { + "epoch": 0.32609, + "grad_norm": 0.974555891891903, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 32609 + }, + { + "epoch": 0.3261, + "grad_norm": 1.0126762032812027, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 32610 + }, + { + "epoch": 0.32611, + "grad_norm": 0.9876399508552836, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 32611 + }, + { + "epoch": 0.32612, + "grad_norm": 0.9750974939779021, + "learning_rate": 0.003, + "loss": 4.039, + "step": 32612 + }, + { + "epoch": 0.32613, + "grad_norm": 0.9660282178438745, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 32613 + }, + { + "epoch": 0.32614, + "grad_norm": 0.8295218995529398, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 32614 + }, + { + "epoch": 0.32615, + "grad_norm": 0.7612581110362032, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 32615 + }, + { + "epoch": 0.32616, + "grad_norm": 0.7779945136489033, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 32616 + }, + { + "epoch": 0.32617, + "grad_norm": 0.6708131965208577, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 32617 + }, + { + "epoch": 0.32618, + "grad_norm": 0.7142469237214132, + "learning_rate": 0.003, + "loss": 4.058, + "step": 32618 + }, + { + "epoch": 0.32619, + "grad_norm": 0.6393939557439822, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 32619 + }, + { + "epoch": 0.3262, + "grad_norm": 0.5527806307421838, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 32620 + }, + { + "epoch": 0.32621, + "grad_norm": 0.5295230904221768, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 32621 + }, + { + "epoch": 0.32622, + "grad_norm": 0.5477791232621022, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 32622 + }, + { + "epoch": 0.32623, + "grad_norm": 0.5362808847597705, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 32623 + }, + { + "epoch": 0.32624, + "grad_norm": 0.538065666078473, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 32624 + }, + { + "epoch": 0.32625, + "grad_norm": 0.6193450727026222, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 32625 + }, + { + "epoch": 0.32626, + "grad_norm": 0.8345618380605104, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 32626 + }, + { + "epoch": 0.32627, + "grad_norm": 1.0550190643870263, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 32627 + }, + { + "epoch": 0.32628, + "grad_norm": 0.9985699048602811, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 32628 + }, + { + "epoch": 0.32629, + "grad_norm": 1.1110949474623533, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 32629 + }, + { + "epoch": 0.3263, + "grad_norm": 0.8758576027769994, + "learning_rate": 0.003, + "loss": 4.039, + "step": 32630 + }, + { + "epoch": 0.32631, + "grad_norm": 0.7954153776303534, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 32631 + }, + { + "epoch": 0.32632, + "grad_norm": 0.7236414386816926, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 32632 + }, + { + "epoch": 0.32633, + "grad_norm": 0.7530452027683179, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 32633 + }, + { + "epoch": 0.32634, + "grad_norm": 0.8438880254603348, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 32634 + }, + { + "epoch": 0.32635, + "grad_norm": 0.8721834088253939, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 32635 + }, + { + "epoch": 0.32636, + "grad_norm": 0.9264135502218175, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 32636 + }, + { + "epoch": 0.32637, + "grad_norm": 1.0075506190400882, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 32637 + }, + { + "epoch": 0.32638, + "grad_norm": 1.0410723311340915, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 32638 + }, + { + "epoch": 0.32639, + "grad_norm": 1.0364610469616946, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 32639 + }, + { + "epoch": 0.3264, + "grad_norm": 0.9252616639099874, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 32640 + }, + { + "epoch": 0.32641, + "grad_norm": 0.9706459880949335, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 32641 + }, + { + "epoch": 0.32642, + "grad_norm": 0.8801712483191553, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 32642 + }, + { + "epoch": 0.32643, + "grad_norm": 0.7759356203642023, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 32643 + }, + { + "epoch": 0.32644, + "grad_norm": 0.7258853963312211, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 32644 + }, + { + "epoch": 0.32645, + "grad_norm": 0.7517368081529687, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 32645 + }, + { + "epoch": 0.32646, + "grad_norm": 0.7749757729682596, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 32646 + }, + { + "epoch": 0.32647, + "grad_norm": 0.7256552981453709, + "learning_rate": 0.003, + "loss": 4.016, + "step": 32647 + }, + { + "epoch": 0.32648, + "grad_norm": 0.7080354367273922, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 32648 + }, + { + "epoch": 0.32649, + "grad_norm": 0.653708836781338, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 32649 + }, + { + "epoch": 0.3265, + "grad_norm": 0.6980347320692092, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 32650 + }, + { + "epoch": 0.32651, + "grad_norm": 0.7180027788283422, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 32651 + }, + { + "epoch": 0.32652, + "grad_norm": 0.6837123250410853, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 32652 + }, + { + "epoch": 0.32653, + "grad_norm": 0.6319153449119808, + "learning_rate": 0.003, + "loss": 4.023, + "step": 32653 + }, + { + "epoch": 0.32654, + "grad_norm": 0.6040908463326176, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 32654 + }, + { + "epoch": 0.32655, + "grad_norm": 0.6489830899567638, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 32655 + }, + { + "epoch": 0.32656, + "grad_norm": 0.7955126444185715, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 32656 + }, + { + "epoch": 0.32657, + "grad_norm": 1.0162980219700033, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 32657 + }, + { + "epoch": 0.32658, + "grad_norm": 1.2259544457286908, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 32658 + }, + { + "epoch": 0.32659, + "grad_norm": 0.8254030540123612, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 32659 + }, + { + "epoch": 0.3266, + "grad_norm": 0.763334812252952, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 32660 + }, + { + "epoch": 0.32661, + "grad_norm": 0.8867958982704471, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 32661 + }, + { + "epoch": 0.32662, + "grad_norm": 1.0904808310847616, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 32662 + }, + { + "epoch": 0.32663, + "grad_norm": 1.0517896808670215, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 32663 + }, + { + "epoch": 0.32664, + "grad_norm": 0.9930361747053034, + "learning_rate": 0.003, + "loss": 4.043, + "step": 32664 + }, + { + "epoch": 0.32665, + "grad_norm": 0.987262923600527, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 32665 + }, + { + "epoch": 0.32666, + "grad_norm": 0.874765296228934, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 32666 + }, + { + "epoch": 0.32667, + "grad_norm": 0.7678586836557916, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 32667 + }, + { + "epoch": 0.32668, + "grad_norm": 0.8181598242740705, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 32668 + }, + { + "epoch": 0.32669, + "grad_norm": 0.7656016343731741, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 32669 + }, + { + "epoch": 0.3267, + "grad_norm": 0.8619763462925668, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 32670 + }, + { + "epoch": 0.32671, + "grad_norm": 0.9582301948223413, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 32671 + }, + { + "epoch": 0.32672, + "grad_norm": 0.9652139678211961, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 32672 + }, + { + "epoch": 0.32673, + "grad_norm": 1.0367779438442388, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 32673 + }, + { + "epoch": 0.32674, + "grad_norm": 0.9659788614990488, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 32674 + }, + { + "epoch": 0.32675, + "grad_norm": 0.881939562643299, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 32675 + }, + { + "epoch": 0.32676, + "grad_norm": 0.7724174547998682, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 32676 + }, + { + "epoch": 0.32677, + "grad_norm": 0.7575952845649925, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 32677 + }, + { + "epoch": 0.32678, + "grad_norm": 0.8084802480084667, + "learning_rate": 0.003, + "loss": 4.034, + "step": 32678 + }, + { + "epoch": 0.32679, + "grad_norm": 0.8625707916712021, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 32679 + }, + { + "epoch": 0.3268, + "grad_norm": 0.9580479265298185, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 32680 + }, + { + "epoch": 0.32681, + "grad_norm": 1.0402644671104464, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 32681 + }, + { + "epoch": 0.32682, + "grad_norm": 0.9472377883004115, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 32682 + }, + { + "epoch": 0.32683, + "grad_norm": 0.8317943075599579, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 32683 + }, + { + "epoch": 0.32684, + "grad_norm": 0.7632418106706249, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 32684 + }, + { + "epoch": 0.32685, + "grad_norm": 0.7534942842463856, + "learning_rate": 0.003, + "loss": 4.065, + "step": 32685 + }, + { + "epoch": 0.32686, + "grad_norm": 0.8284583472894079, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 32686 + }, + { + "epoch": 0.32687, + "grad_norm": 0.9306673896031317, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 32687 + }, + { + "epoch": 0.32688, + "grad_norm": 0.8304650538465979, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 32688 + }, + { + "epoch": 0.32689, + "grad_norm": 0.785430550340263, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 32689 + }, + { + "epoch": 0.3269, + "grad_norm": 0.6266096363208098, + "learning_rate": 0.003, + "loss": 4.026, + "step": 32690 + }, + { + "epoch": 0.32691, + "grad_norm": 0.557663418961099, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 32691 + }, + { + "epoch": 0.32692, + "grad_norm": 0.6189671394753006, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 32692 + }, + { + "epoch": 0.32693, + "grad_norm": 0.6267912499930638, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 32693 + }, + { + "epoch": 0.32694, + "grad_norm": 0.6588836690483691, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 32694 + }, + { + "epoch": 0.32695, + "grad_norm": 0.8181272872686683, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 32695 + }, + { + "epoch": 0.32696, + "grad_norm": 1.0974425053942243, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 32696 + }, + { + "epoch": 0.32697, + "grad_norm": 1.050231939654034, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 32697 + }, + { + "epoch": 0.32698, + "grad_norm": 0.9951396110089871, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 32698 + }, + { + "epoch": 0.32699, + "grad_norm": 0.9862856437889773, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 32699 + }, + { + "epoch": 0.327, + "grad_norm": 0.7648720160015711, + "learning_rate": 0.003, + "loss": 4.031, + "step": 32700 + }, + { + "epoch": 0.32701, + "grad_norm": 0.667367667600846, + "learning_rate": 0.003, + "loss": 3.9832, + "step": 32701 + }, + { + "epoch": 0.32702, + "grad_norm": 0.6424079184842114, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 32702 + }, + { + "epoch": 0.32703, + "grad_norm": 0.5917322200430145, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 32703 + }, + { + "epoch": 0.32704, + "grad_norm": 0.5470828881701201, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 32704 + }, + { + "epoch": 0.32705, + "grad_norm": 0.5138043546739243, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 32705 + }, + { + "epoch": 0.32706, + "grad_norm": 0.5929314003412735, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 32706 + }, + { + "epoch": 0.32707, + "grad_norm": 0.6373304970622402, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 32707 + }, + { + "epoch": 0.32708, + "grad_norm": 0.703832100855302, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 32708 + }, + { + "epoch": 0.32709, + "grad_norm": 0.7549283104495754, + "learning_rate": 0.003, + "loss": 4.0064, + "step": 32709 + }, + { + "epoch": 0.3271, + "grad_norm": 0.7620229290117326, + "learning_rate": 0.003, + "loss": 4.026, + "step": 32710 + }, + { + "epoch": 0.32711, + "grad_norm": 0.7832799994066906, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 32711 + }, + { + "epoch": 0.32712, + "grad_norm": 0.7580807393203416, + "learning_rate": 0.003, + "loss": 4.033, + "step": 32712 + }, + { + "epoch": 0.32713, + "grad_norm": 0.659404874892149, + "learning_rate": 0.003, + "loss": 4.029, + "step": 32713 + }, + { + "epoch": 0.32714, + "grad_norm": 0.7051771385456851, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 32714 + }, + { + "epoch": 0.32715, + "grad_norm": 0.7010914372282241, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 32715 + }, + { + "epoch": 0.32716, + "grad_norm": 0.6621773394646734, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 32716 + }, + { + "epoch": 0.32717, + "grad_norm": 0.6818721600205901, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 32717 + }, + { + "epoch": 0.32718, + "grad_norm": 0.9320988713645613, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 32718 + }, + { + "epoch": 0.32719, + "grad_norm": 1.3932583721752971, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 32719 + }, + { + "epoch": 0.3272, + "grad_norm": 0.9645857869591395, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 32720 + }, + { + "epoch": 0.32721, + "grad_norm": 1.2428984740611644, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 32721 + }, + { + "epoch": 0.32722, + "grad_norm": 0.8873462594108565, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 32722 + }, + { + "epoch": 0.32723, + "grad_norm": 0.7022737471310789, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 32723 + }, + { + "epoch": 0.32724, + "grad_norm": 0.7211253269266785, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 32724 + }, + { + "epoch": 0.32725, + "grad_norm": 0.8661369481518064, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 32725 + }, + { + "epoch": 0.32726, + "grad_norm": 0.9377716395987876, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 32726 + }, + { + "epoch": 0.32727, + "grad_norm": 1.0802130972570518, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 32727 + }, + { + "epoch": 0.32728, + "grad_norm": 1.0363292223459, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 32728 + }, + { + "epoch": 0.32729, + "grad_norm": 1.0393686759906342, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 32729 + }, + { + "epoch": 0.3273, + "grad_norm": 0.8371285469121365, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 32730 + }, + { + "epoch": 0.32731, + "grad_norm": 0.7043283964179214, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 32731 + }, + { + "epoch": 0.32732, + "grad_norm": 0.778717349106579, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 32732 + }, + { + "epoch": 0.32733, + "grad_norm": 0.8882257557875933, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 32733 + }, + { + "epoch": 0.32734, + "grad_norm": 0.829367890903293, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 32734 + }, + { + "epoch": 0.32735, + "grad_norm": 0.7841934051389784, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 32735 + }, + { + "epoch": 0.32736, + "grad_norm": 0.9913421670205981, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 32736 + }, + { + "epoch": 0.32737, + "grad_norm": 1.202738267876105, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 32737 + }, + { + "epoch": 0.32738, + "grad_norm": 0.6559853319127903, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 32738 + }, + { + "epoch": 0.32739, + "grad_norm": 0.6200160688716877, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 32739 + }, + { + "epoch": 0.3274, + "grad_norm": 0.6569301779717273, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 32740 + }, + { + "epoch": 0.32741, + "grad_norm": 0.8037285825040849, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 32741 + }, + { + "epoch": 0.32742, + "grad_norm": 0.9896515288534776, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 32742 + }, + { + "epoch": 0.32743, + "grad_norm": 1.0469308554081531, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 32743 + }, + { + "epoch": 0.32744, + "grad_norm": 0.9763320918160817, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 32744 + }, + { + "epoch": 0.32745, + "grad_norm": 0.9842291833961888, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 32745 + }, + { + "epoch": 0.32746, + "grad_norm": 0.8830098244107689, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 32746 + }, + { + "epoch": 0.32747, + "grad_norm": 0.9544515543146259, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 32747 + }, + { + "epoch": 0.32748, + "grad_norm": 1.1130036547374291, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 32748 + }, + { + "epoch": 0.32749, + "grad_norm": 1.0951183372452218, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 32749 + }, + { + "epoch": 0.3275, + "grad_norm": 1.0124139267030214, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 32750 + }, + { + "epoch": 0.32751, + "grad_norm": 1.0064893755879278, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 32751 + }, + { + "epoch": 0.32752, + "grad_norm": 1.167731086372188, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 32752 + }, + { + "epoch": 0.32753, + "grad_norm": 0.9050129996156847, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 32753 + }, + { + "epoch": 0.32754, + "grad_norm": 0.7887033142312346, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 32754 + }, + { + "epoch": 0.32755, + "grad_norm": 0.8204010796983877, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 32755 + }, + { + "epoch": 0.32756, + "grad_norm": 0.7763758688653838, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 32756 + }, + { + "epoch": 0.32757, + "grad_norm": 0.9575028287853947, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 32757 + }, + { + "epoch": 0.32758, + "grad_norm": 1.0509666025082258, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 32758 + }, + { + "epoch": 0.32759, + "grad_norm": 1.0030692357601934, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 32759 + }, + { + "epoch": 0.3276, + "grad_norm": 0.9734856504793199, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 32760 + }, + { + "epoch": 0.32761, + "grad_norm": 0.8833628174586879, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 32761 + }, + { + "epoch": 0.32762, + "grad_norm": 0.7937855341856025, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 32762 + }, + { + "epoch": 0.32763, + "grad_norm": 0.6951011194511362, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 32763 + }, + { + "epoch": 0.32764, + "grad_norm": 0.7218502057546252, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 32764 + }, + { + "epoch": 0.32765, + "grad_norm": 0.8275231488756186, + "learning_rate": 0.003, + "loss": 4.019, + "step": 32765 + }, + { + "epoch": 0.32766, + "grad_norm": 0.9736101849150745, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 32766 + }, + { + "epoch": 0.32767, + "grad_norm": 1.0231586093677552, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 32767 + }, + { + "epoch": 0.32768, + "grad_norm": 0.861336173411444, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 32768 + }, + { + "epoch": 0.32769, + "grad_norm": 0.7341686431654569, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 32769 + }, + { + "epoch": 0.3277, + "grad_norm": 0.7664811683186009, + "learning_rate": 0.003, + "loss": 4.049, + "step": 32770 + }, + { + "epoch": 0.32771, + "grad_norm": 0.800284485795897, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 32771 + }, + { + "epoch": 0.32772, + "grad_norm": 0.6882773774808549, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 32772 + }, + { + "epoch": 0.32773, + "grad_norm": 0.7567310844584764, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 32773 + }, + { + "epoch": 0.32774, + "grad_norm": 0.9345205319672601, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 32774 + }, + { + "epoch": 0.32775, + "grad_norm": 1.1168300273513763, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 32775 + }, + { + "epoch": 0.32776, + "grad_norm": 0.9851451871273431, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 32776 + }, + { + "epoch": 0.32777, + "grad_norm": 0.809832888975116, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 32777 + }, + { + "epoch": 0.32778, + "grad_norm": 0.7114239327686078, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 32778 + }, + { + "epoch": 0.32779, + "grad_norm": 0.664959126672734, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 32779 + }, + { + "epoch": 0.3278, + "grad_norm": 0.6551251876227606, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 32780 + }, + { + "epoch": 0.32781, + "grad_norm": 0.6948834932724413, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 32781 + }, + { + "epoch": 0.32782, + "grad_norm": 0.6978697874465555, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 32782 + }, + { + "epoch": 0.32783, + "grad_norm": 0.60805044380455, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 32783 + }, + { + "epoch": 0.32784, + "grad_norm": 0.6184721772228089, + "learning_rate": 0.003, + "loss": 4.0065, + "step": 32784 + }, + { + "epoch": 0.32785, + "grad_norm": 0.6153589703558715, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 32785 + }, + { + "epoch": 0.32786, + "grad_norm": 0.6272600819447444, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 32786 + }, + { + "epoch": 0.32787, + "grad_norm": 0.6548382203752734, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 32787 + }, + { + "epoch": 0.32788, + "grad_norm": 0.7113253310900927, + "learning_rate": 0.003, + "loss": 3.9954, + "step": 32788 + }, + { + "epoch": 0.32789, + "grad_norm": 0.6288826086523921, + "learning_rate": 0.003, + "loss": 4.043, + "step": 32789 + }, + { + "epoch": 0.3279, + "grad_norm": 0.7075308254836788, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 32790 + }, + { + "epoch": 0.32791, + "grad_norm": 0.9090959211105727, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 32791 + }, + { + "epoch": 0.32792, + "grad_norm": 1.1252312258679864, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 32792 + }, + { + "epoch": 0.32793, + "grad_norm": 0.9153018125687011, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 32793 + }, + { + "epoch": 0.32794, + "grad_norm": 0.9145029885204722, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 32794 + }, + { + "epoch": 0.32795, + "grad_norm": 0.8658092573008562, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 32795 + }, + { + "epoch": 0.32796, + "grad_norm": 0.8092578250150014, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 32796 + }, + { + "epoch": 0.32797, + "grad_norm": 0.7603594854822793, + "learning_rate": 0.003, + "loss": 3.9972, + "step": 32797 + }, + { + "epoch": 0.32798, + "grad_norm": 0.7855196408310438, + "learning_rate": 0.003, + "loss": 4.035, + "step": 32798 + }, + { + "epoch": 0.32799, + "grad_norm": 0.8468278100072724, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 32799 + }, + { + "epoch": 0.328, + "grad_norm": 0.8550529747746985, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 32800 + }, + { + "epoch": 0.32801, + "grad_norm": 0.8156413881538058, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 32801 + }, + { + "epoch": 0.32802, + "grad_norm": 0.7842351392087904, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 32802 + }, + { + "epoch": 0.32803, + "grad_norm": 0.8473227112000773, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 32803 + }, + { + "epoch": 0.32804, + "grad_norm": 0.9566836814705664, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 32804 + }, + { + "epoch": 0.32805, + "grad_norm": 1.0496157145740168, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 32805 + }, + { + "epoch": 0.32806, + "grad_norm": 1.0949280779621868, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 32806 + }, + { + "epoch": 0.32807, + "grad_norm": 1.1463239599347805, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 32807 + }, + { + "epoch": 0.32808, + "grad_norm": 1.0031558213689882, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 32808 + }, + { + "epoch": 0.32809, + "grad_norm": 1.013256744128986, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 32809 + }, + { + "epoch": 0.3281, + "grad_norm": 0.8858231018729169, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 32810 + }, + { + "epoch": 0.32811, + "grad_norm": 0.8208672329834767, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 32811 + }, + { + "epoch": 0.32812, + "grad_norm": 0.7790944077576955, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 32812 + }, + { + "epoch": 0.32813, + "grad_norm": 0.7128251901038437, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 32813 + }, + { + "epoch": 0.32814, + "grad_norm": 0.7331569463675273, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 32814 + }, + { + "epoch": 0.32815, + "grad_norm": 0.7246077819154009, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 32815 + }, + { + "epoch": 0.32816, + "grad_norm": 0.7339000471952554, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 32816 + }, + { + "epoch": 0.32817, + "grad_norm": 0.7629191471134752, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 32817 + }, + { + "epoch": 0.32818, + "grad_norm": 0.8787444140475765, + "learning_rate": 0.003, + "loss": 4.0045, + "step": 32818 + }, + { + "epoch": 0.32819, + "grad_norm": 0.9304823197642275, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 32819 + }, + { + "epoch": 0.3282, + "grad_norm": 0.8383766193968717, + "learning_rate": 0.003, + "loss": 4.041, + "step": 32820 + }, + { + "epoch": 0.32821, + "grad_norm": 0.7229392827429199, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 32821 + }, + { + "epoch": 0.32822, + "grad_norm": 0.6221199408339271, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 32822 + }, + { + "epoch": 0.32823, + "grad_norm": 0.6677408195564506, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 32823 + }, + { + "epoch": 0.32824, + "grad_norm": 0.6836661735359091, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 32824 + }, + { + "epoch": 0.32825, + "grad_norm": 0.699900358183799, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 32825 + }, + { + "epoch": 0.32826, + "grad_norm": 0.7801314694170908, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 32826 + }, + { + "epoch": 0.32827, + "grad_norm": 0.8380769409537585, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 32827 + }, + { + "epoch": 0.32828, + "grad_norm": 1.0535551010951132, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 32828 + }, + { + "epoch": 0.32829, + "grad_norm": 1.2305865195060723, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 32829 + }, + { + "epoch": 0.3283, + "grad_norm": 0.7102852956524855, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 32830 + }, + { + "epoch": 0.32831, + "grad_norm": 0.7074637090567749, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 32831 + }, + { + "epoch": 0.32832, + "grad_norm": 0.7594337069188017, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 32832 + }, + { + "epoch": 0.32833, + "grad_norm": 0.7695622679845707, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 32833 + }, + { + "epoch": 0.32834, + "grad_norm": 0.807565039625592, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 32834 + }, + { + "epoch": 0.32835, + "grad_norm": 0.9013615683330852, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 32835 + }, + { + "epoch": 0.32836, + "grad_norm": 1.0746814591792715, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 32836 + }, + { + "epoch": 0.32837, + "grad_norm": 1.0622191416300537, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 32837 + }, + { + "epoch": 0.32838, + "grad_norm": 0.9410174559653293, + "learning_rate": 0.003, + "loss": 4.031, + "step": 32838 + }, + { + "epoch": 0.32839, + "grad_norm": 0.8977356456870198, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 32839 + }, + { + "epoch": 0.3284, + "grad_norm": 0.8150098056799706, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 32840 + }, + { + "epoch": 0.32841, + "grad_norm": 0.7356614590927222, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 32841 + }, + { + "epoch": 0.32842, + "grad_norm": 0.6357212271655129, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 32842 + }, + { + "epoch": 0.32843, + "grad_norm": 0.6973430456971517, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 32843 + }, + { + "epoch": 0.32844, + "grad_norm": 0.7740847534768226, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 32844 + }, + { + "epoch": 0.32845, + "grad_norm": 0.9419775722595225, + "learning_rate": 0.003, + "loss": 4.051, + "step": 32845 + }, + { + "epoch": 0.32846, + "grad_norm": 1.1018000004628943, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 32846 + }, + { + "epoch": 0.32847, + "grad_norm": 0.8494176195289611, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 32847 + }, + { + "epoch": 0.32848, + "grad_norm": 0.6334640000951307, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 32848 + }, + { + "epoch": 0.32849, + "grad_norm": 0.5593367295177553, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 32849 + }, + { + "epoch": 0.3285, + "grad_norm": 0.5195682674158729, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 32850 + }, + { + "epoch": 0.32851, + "grad_norm": 0.47280169689782997, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 32851 + }, + { + "epoch": 0.32852, + "grad_norm": 0.4553147953994516, + "learning_rate": 0.003, + "loss": 3.9964, + "step": 32852 + }, + { + "epoch": 0.32853, + "grad_norm": 0.5241443388032121, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 32853 + }, + { + "epoch": 0.32854, + "grad_norm": 0.6475695789677293, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 32854 + }, + { + "epoch": 0.32855, + "grad_norm": 0.7304299685805818, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 32855 + }, + { + "epoch": 0.32856, + "grad_norm": 0.8377203729794402, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 32856 + }, + { + "epoch": 0.32857, + "grad_norm": 1.0310215721881397, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 32857 + }, + { + "epoch": 0.32858, + "grad_norm": 1.2152538321588076, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 32858 + }, + { + "epoch": 0.32859, + "grad_norm": 0.7291529753919089, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 32859 + }, + { + "epoch": 0.3286, + "grad_norm": 0.8052154872052569, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 32860 + }, + { + "epoch": 0.32861, + "grad_norm": 0.7989826991642558, + "learning_rate": 0.003, + "loss": 3.9882, + "step": 32861 + }, + { + "epoch": 0.32862, + "grad_norm": 0.8289516377325735, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 32862 + }, + { + "epoch": 0.32863, + "grad_norm": 0.8380873165247007, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 32863 + }, + { + "epoch": 0.32864, + "grad_norm": 0.9162193404347158, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 32864 + }, + { + "epoch": 0.32865, + "grad_norm": 1.1517260715962037, + "learning_rate": 0.003, + "loss": 4.062, + "step": 32865 + }, + { + "epoch": 0.32866, + "grad_norm": 1.131957426916179, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 32866 + }, + { + "epoch": 0.32867, + "grad_norm": 1.0244120319104884, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 32867 + }, + { + "epoch": 0.32868, + "grad_norm": 0.8919543457532081, + "learning_rate": 0.003, + "loss": 4.042, + "step": 32868 + }, + { + "epoch": 0.32869, + "grad_norm": 0.9891846221221303, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 32869 + }, + { + "epoch": 0.3287, + "grad_norm": 1.046565328288606, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 32870 + }, + { + "epoch": 0.32871, + "grad_norm": 0.947701273631773, + "learning_rate": 0.003, + "loss": 4.062, + "step": 32871 + }, + { + "epoch": 0.32872, + "grad_norm": 1.0529242828476786, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 32872 + }, + { + "epoch": 0.32873, + "grad_norm": 1.0563269855849862, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 32873 + }, + { + "epoch": 0.32874, + "grad_norm": 1.0806617276353185, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 32874 + }, + { + "epoch": 0.32875, + "grad_norm": 0.9481785810853204, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 32875 + }, + { + "epoch": 0.32876, + "grad_norm": 0.832662466974176, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 32876 + }, + { + "epoch": 0.32877, + "grad_norm": 0.856259294451329, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 32877 + }, + { + "epoch": 0.32878, + "grad_norm": 0.8522547378696904, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 32878 + }, + { + "epoch": 0.32879, + "grad_norm": 0.8535406273683419, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 32879 + }, + { + "epoch": 0.3288, + "grad_norm": 0.9512553732259182, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 32880 + }, + { + "epoch": 0.32881, + "grad_norm": 0.9931018712998594, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 32881 + }, + { + "epoch": 0.32882, + "grad_norm": 1.0299175120463493, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 32882 + }, + { + "epoch": 0.32883, + "grad_norm": 0.9199998764376913, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 32883 + }, + { + "epoch": 0.32884, + "grad_norm": 0.8800868299741419, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 32884 + }, + { + "epoch": 0.32885, + "grad_norm": 1.0026752539669597, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 32885 + }, + { + "epoch": 0.32886, + "grad_norm": 1.282599294948141, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 32886 + }, + { + "epoch": 0.32887, + "grad_norm": 0.7950167391585516, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 32887 + }, + { + "epoch": 0.32888, + "grad_norm": 0.6825322821753992, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 32888 + }, + { + "epoch": 0.32889, + "grad_norm": 0.6783807698471133, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 32889 + }, + { + "epoch": 0.3289, + "grad_norm": 0.5986695208295352, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 32890 + }, + { + "epoch": 0.32891, + "grad_norm": 0.6750465113519022, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 32891 + }, + { + "epoch": 0.32892, + "grad_norm": 0.7778988312711068, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 32892 + }, + { + "epoch": 0.32893, + "grad_norm": 0.8767689743543904, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 32893 + }, + { + "epoch": 0.32894, + "grad_norm": 0.9046482255455346, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 32894 + }, + { + "epoch": 0.32895, + "grad_norm": 0.8832149933866696, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 32895 + }, + { + "epoch": 0.32896, + "grad_norm": 0.7482154066347795, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 32896 + }, + { + "epoch": 0.32897, + "grad_norm": 0.6285002657114515, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 32897 + }, + { + "epoch": 0.32898, + "grad_norm": 0.6061916874614466, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 32898 + }, + { + "epoch": 0.32899, + "grad_norm": 0.5265155990607695, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 32899 + }, + { + "epoch": 0.329, + "grad_norm": 0.45829071943881494, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 32900 + }, + { + "epoch": 0.32901, + "grad_norm": 0.4878095817938297, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 32901 + }, + { + "epoch": 0.32902, + "grad_norm": 0.550599095671545, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 32902 + }, + { + "epoch": 0.32903, + "grad_norm": 0.5836665807120486, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 32903 + }, + { + "epoch": 0.32904, + "grad_norm": 0.770545039221921, + "learning_rate": 0.003, + "loss": 3.9961, + "step": 32904 + }, + { + "epoch": 0.32905, + "grad_norm": 1.099832132281905, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 32905 + }, + { + "epoch": 0.32906, + "grad_norm": 1.1132582469446257, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 32906 + }, + { + "epoch": 0.32907, + "grad_norm": 0.7806003274467955, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 32907 + }, + { + "epoch": 0.32908, + "grad_norm": 0.6640957696895113, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 32908 + }, + { + "epoch": 0.32909, + "grad_norm": 0.7425006316226405, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 32909 + }, + { + "epoch": 0.3291, + "grad_norm": 0.7915511300740715, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 32910 + }, + { + "epoch": 0.32911, + "grad_norm": 0.8840312936087314, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 32911 + }, + { + "epoch": 0.32912, + "grad_norm": 1.021036770325842, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 32912 + }, + { + "epoch": 0.32913, + "grad_norm": 0.9682714857754495, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 32913 + }, + { + "epoch": 0.32914, + "grad_norm": 0.8826008650793474, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 32914 + }, + { + "epoch": 0.32915, + "grad_norm": 0.848089142066145, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 32915 + }, + { + "epoch": 0.32916, + "grad_norm": 0.828881911317961, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 32916 + }, + { + "epoch": 0.32917, + "grad_norm": 0.7775530005306261, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 32917 + }, + { + "epoch": 0.32918, + "grad_norm": 0.9062100218694806, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 32918 + }, + { + "epoch": 0.32919, + "grad_norm": 0.9532959049446321, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 32919 + }, + { + "epoch": 0.3292, + "grad_norm": 1.0415259328889153, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 32920 + }, + { + "epoch": 0.32921, + "grad_norm": 1.068833788514106, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 32921 + }, + { + "epoch": 0.32922, + "grad_norm": 0.8911295654545001, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 32922 + }, + { + "epoch": 0.32923, + "grad_norm": 1.0177446975275957, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 32923 + }, + { + "epoch": 0.32924, + "grad_norm": 1.0747989379074088, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 32924 + }, + { + "epoch": 0.32925, + "grad_norm": 0.9789683598810396, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 32925 + }, + { + "epoch": 0.32926, + "grad_norm": 0.9775972281245453, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 32926 + }, + { + "epoch": 0.32927, + "grad_norm": 1.0653559273687836, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 32927 + }, + { + "epoch": 0.32928, + "grad_norm": 0.8535810462653094, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 32928 + }, + { + "epoch": 0.32929, + "grad_norm": 0.7208469766119024, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 32929 + }, + { + "epoch": 0.3293, + "grad_norm": 0.697632945595685, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 32930 + }, + { + "epoch": 0.32931, + "grad_norm": 0.7403299444381116, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 32931 + }, + { + "epoch": 0.32932, + "grad_norm": 0.8034513289524816, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 32932 + }, + { + "epoch": 0.32933, + "grad_norm": 0.8739940049693071, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 32933 + }, + { + "epoch": 0.32934, + "grad_norm": 0.8868919259366007, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 32934 + }, + { + "epoch": 0.32935, + "grad_norm": 1.0251203297051645, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 32935 + }, + { + "epoch": 0.32936, + "grad_norm": 1.1067275598297208, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 32936 + }, + { + "epoch": 0.32937, + "grad_norm": 0.8650980937687841, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 32937 + }, + { + "epoch": 0.32938, + "grad_norm": 0.8812372770906299, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 32938 + }, + { + "epoch": 0.32939, + "grad_norm": 0.9584707317925654, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 32939 + }, + { + "epoch": 0.3294, + "grad_norm": 0.9956592200115973, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 32940 + }, + { + "epoch": 0.32941, + "grad_norm": 1.154067678980052, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 32941 + }, + { + "epoch": 0.32942, + "grad_norm": 1.0523857857797632, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 32942 + }, + { + "epoch": 0.32943, + "grad_norm": 0.892917706443352, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 32943 + }, + { + "epoch": 0.32944, + "grad_norm": 0.8480201439517843, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 32944 + }, + { + "epoch": 0.32945, + "grad_norm": 0.8357257780472986, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 32945 + }, + { + "epoch": 0.32946, + "grad_norm": 0.8789828001425375, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 32946 + }, + { + "epoch": 0.32947, + "grad_norm": 0.8085879264549248, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 32947 + }, + { + "epoch": 0.32948, + "grad_norm": 0.7198950995932402, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 32948 + }, + { + "epoch": 0.32949, + "grad_norm": 0.7034553120131015, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 32949 + }, + { + "epoch": 0.3295, + "grad_norm": 0.5573712916322202, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 32950 + }, + { + "epoch": 0.32951, + "grad_norm": 0.577346444423763, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 32951 + }, + { + "epoch": 0.32952, + "grad_norm": 0.5405754420085164, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 32952 + }, + { + "epoch": 0.32953, + "grad_norm": 0.5728345638485468, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 32953 + }, + { + "epoch": 0.32954, + "grad_norm": 0.5243170441269865, + "learning_rate": 0.003, + "loss": 3.9928, + "step": 32954 + }, + { + "epoch": 0.32955, + "grad_norm": 0.5388268020406098, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 32955 + }, + { + "epoch": 0.32956, + "grad_norm": 0.6098676786214638, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 32956 + }, + { + "epoch": 0.32957, + "grad_norm": 0.7932129683142589, + "learning_rate": 0.003, + "loss": 4.0069, + "step": 32957 + }, + { + "epoch": 0.32958, + "grad_norm": 1.0465855761079197, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 32958 + }, + { + "epoch": 0.32959, + "grad_norm": 1.1591500231503091, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 32959 + }, + { + "epoch": 0.3296, + "grad_norm": 0.7010389428677801, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 32960 + }, + { + "epoch": 0.32961, + "grad_norm": 0.7108886402635417, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 32961 + }, + { + "epoch": 0.32962, + "grad_norm": 0.951959476885274, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 32962 + }, + { + "epoch": 0.32963, + "grad_norm": 0.9088738633470986, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 32963 + }, + { + "epoch": 0.32964, + "grad_norm": 0.7386644504758165, + "learning_rate": 0.003, + "loss": 4.033, + "step": 32964 + }, + { + "epoch": 0.32965, + "grad_norm": 0.6114699565755193, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 32965 + }, + { + "epoch": 0.32966, + "grad_norm": 0.6235121351844526, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 32966 + }, + { + "epoch": 0.32967, + "grad_norm": 0.6606396564616415, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 32967 + }, + { + "epoch": 0.32968, + "grad_norm": 0.693656386127757, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 32968 + }, + { + "epoch": 0.32969, + "grad_norm": 0.6710489270492578, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 32969 + }, + { + "epoch": 0.3297, + "grad_norm": 0.6419747151612746, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 32970 + }, + { + "epoch": 0.32971, + "grad_norm": 0.728378133345311, + "learning_rate": 0.003, + "loss": 4.039, + "step": 32971 + }, + { + "epoch": 0.32972, + "grad_norm": 0.9399211035940511, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 32972 + }, + { + "epoch": 0.32973, + "grad_norm": 1.2331918347203468, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 32973 + }, + { + "epoch": 0.32974, + "grad_norm": 0.9817697625215905, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 32974 + }, + { + "epoch": 0.32975, + "grad_norm": 0.9762982668494186, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 32975 + }, + { + "epoch": 0.32976, + "grad_norm": 0.9963516397266479, + "learning_rate": 0.003, + "loss": 4.051, + "step": 32976 + }, + { + "epoch": 0.32977, + "grad_norm": 0.9031088791599937, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 32977 + }, + { + "epoch": 0.32978, + "grad_norm": 0.8011083457228265, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 32978 + }, + { + "epoch": 0.32979, + "grad_norm": 0.8664963973162351, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 32979 + }, + { + "epoch": 0.3298, + "grad_norm": 0.8986005786448901, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 32980 + }, + { + "epoch": 0.32981, + "grad_norm": 0.769223951727999, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 32981 + }, + { + "epoch": 0.32982, + "grad_norm": 0.7951305047954368, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 32982 + }, + { + "epoch": 0.32983, + "grad_norm": 0.7589346432436799, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 32983 + }, + { + "epoch": 0.32984, + "grad_norm": 0.7188597246609556, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 32984 + }, + { + "epoch": 0.32985, + "grad_norm": 0.7292978841448944, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 32985 + }, + { + "epoch": 0.32986, + "grad_norm": 0.7463183379443021, + "learning_rate": 0.003, + "loss": 4.019, + "step": 32986 + }, + { + "epoch": 0.32987, + "grad_norm": 0.9889820079680672, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 32987 + }, + { + "epoch": 0.32988, + "grad_norm": 1.4506107242380588, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 32988 + }, + { + "epoch": 0.32989, + "grad_norm": 0.6668609181117161, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 32989 + }, + { + "epoch": 0.3299, + "grad_norm": 0.6709974749042328, + "learning_rate": 0.003, + "loss": 3.987, + "step": 32990 + }, + { + "epoch": 0.32991, + "grad_norm": 0.7136704746114485, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 32991 + }, + { + "epoch": 0.32992, + "grad_norm": 0.8580625543067306, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 32992 + }, + { + "epoch": 0.32993, + "grad_norm": 0.9767770393558276, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 32993 + }, + { + "epoch": 0.32994, + "grad_norm": 0.8938353856851828, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 32994 + }, + { + "epoch": 0.32995, + "grad_norm": 0.7550990979270681, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 32995 + }, + { + "epoch": 0.32996, + "grad_norm": 0.6854508115335984, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 32996 + }, + { + "epoch": 0.32997, + "grad_norm": 0.7790134267869914, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 32997 + }, + { + "epoch": 0.32998, + "grad_norm": 0.8729925345457293, + "learning_rate": 0.003, + "loss": 4.037, + "step": 32998 + }, + { + "epoch": 0.32999, + "grad_norm": 0.932328067434735, + "learning_rate": 0.003, + "loss": 4.048, + "step": 32999 + }, + { + "epoch": 0.33, + "grad_norm": 0.9663453963823336, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 33000 + }, + { + "epoch": 0.33001, + "grad_norm": 1.018007900779193, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 33001 + }, + { + "epoch": 0.33002, + "grad_norm": 1.0447281035563456, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 33002 + }, + { + "epoch": 0.33003, + "grad_norm": 0.8527313736970046, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 33003 + }, + { + "epoch": 0.33004, + "grad_norm": 0.8001387911045483, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 33004 + }, + { + "epoch": 0.33005, + "grad_norm": 0.6883411841498963, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 33005 + }, + { + "epoch": 0.33006, + "grad_norm": 0.5559548292191935, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 33006 + }, + { + "epoch": 0.33007, + "grad_norm": 0.6032598493981243, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 33007 + }, + { + "epoch": 0.33008, + "grad_norm": 0.7358384792324202, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 33008 + }, + { + "epoch": 0.33009, + "grad_norm": 0.8430423008530843, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 33009 + }, + { + "epoch": 0.3301, + "grad_norm": 0.973563549451449, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 33010 + }, + { + "epoch": 0.33011, + "grad_norm": 1.012210197434772, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 33011 + }, + { + "epoch": 0.33012, + "grad_norm": 1.0217254923546066, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 33012 + }, + { + "epoch": 0.33013, + "grad_norm": 1.0398783916194048, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 33013 + }, + { + "epoch": 0.33014, + "grad_norm": 0.9739463734757691, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 33014 + }, + { + "epoch": 0.33015, + "grad_norm": 1.0265481328054102, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 33015 + }, + { + "epoch": 0.33016, + "grad_norm": 1.0334095999935946, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 33016 + }, + { + "epoch": 0.33017, + "grad_norm": 0.9492895189269877, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 33017 + }, + { + "epoch": 0.33018, + "grad_norm": 0.8800428485791957, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 33018 + }, + { + "epoch": 0.33019, + "grad_norm": 0.871331158872836, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 33019 + }, + { + "epoch": 0.3302, + "grad_norm": 0.7675812977087423, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 33020 + }, + { + "epoch": 0.33021, + "grad_norm": 0.7667892611459053, + "learning_rate": 0.003, + "loss": 4.05, + "step": 33021 + }, + { + "epoch": 0.33022, + "grad_norm": 0.7447056052660946, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 33022 + }, + { + "epoch": 0.33023, + "grad_norm": 0.6928891980152762, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 33023 + }, + { + "epoch": 0.33024, + "grad_norm": 0.7068987995862442, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 33024 + }, + { + "epoch": 0.33025, + "grad_norm": 0.696461687536192, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 33025 + }, + { + "epoch": 0.33026, + "grad_norm": 0.7828605854844669, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 33026 + }, + { + "epoch": 0.33027, + "grad_norm": 0.6607250068496198, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 33027 + }, + { + "epoch": 0.33028, + "grad_norm": 0.6199331363031488, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 33028 + }, + { + "epoch": 0.33029, + "grad_norm": 0.6324892754030976, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 33029 + }, + { + "epoch": 0.3303, + "grad_norm": 0.8481556652516239, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 33030 + }, + { + "epoch": 0.33031, + "grad_norm": 1.2606400689978698, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 33031 + }, + { + "epoch": 0.33032, + "grad_norm": 1.011375590370824, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 33032 + }, + { + "epoch": 0.33033, + "grad_norm": 0.9109505432784176, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 33033 + }, + { + "epoch": 0.33034, + "grad_norm": 0.8615735021777486, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 33034 + }, + { + "epoch": 0.33035, + "grad_norm": 0.9595597796898825, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 33035 + }, + { + "epoch": 0.33036, + "grad_norm": 0.9633584486031049, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 33036 + }, + { + "epoch": 0.33037, + "grad_norm": 1.0164412896474802, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 33037 + }, + { + "epoch": 0.33038, + "grad_norm": 1.0021728546952704, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 33038 + }, + { + "epoch": 0.33039, + "grad_norm": 0.9869203671116209, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 33039 + }, + { + "epoch": 0.3304, + "grad_norm": 1.0170191408937705, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 33040 + }, + { + "epoch": 0.33041, + "grad_norm": 1.0080815105241663, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 33041 + }, + { + "epoch": 0.33042, + "grad_norm": 0.9254363443868511, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 33042 + }, + { + "epoch": 0.33043, + "grad_norm": 0.8044571363565541, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 33043 + }, + { + "epoch": 0.33044, + "grad_norm": 0.7563258487917408, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 33044 + }, + { + "epoch": 0.33045, + "grad_norm": 0.757706071134185, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 33045 + }, + { + "epoch": 0.33046, + "grad_norm": 0.8108357816523974, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 33046 + }, + { + "epoch": 0.33047, + "grad_norm": 0.8308322193690573, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 33047 + }, + { + "epoch": 0.33048, + "grad_norm": 0.9739862310875564, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 33048 + }, + { + "epoch": 0.33049, + "grad_norm": 1.1255379964493673, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 33049 + }, + { + "epoch": 0.3305, + "grad_norm": 0.8545191710983772, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 33050 + }, + { + "epoch": 0.33051, + "grad_norm": 0.8375441550345032, + "learning_rate": 0.003, + "loss": 4.0068, + "step": 33051 + }, + { + "epoch": 0.33052, + "grad_norm": 0.742451103499636, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 33052 + }, + { + "epoch": 0.33053, + "grad_norm": 0.7152433887194712, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 33053 + }, + { + "epoch": 0.33054, + "grad_norm": 0.7154163168892614, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 33054 + }, + { + "epoch": 0.33055, + "grad_norm": 0.7283853015195996, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 33055 + }, + { + "epoch": 0.33056, + "grad_norm": 0.659897050774051, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 33056 + }, + { + "epoch": 0.33057, + "grad_norm": 0.7192064488578238, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 33057 + }, + { + "epoch": 0.33058, + "grad_norm": 0.792366103412778, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 33058 + }, + { + "epoch": 0.33059, + "grad_norm": 0.7181267298710002, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 33059 + }, + { + "epoch": 0.3306, + "grad_norm": 0.7835064299380337, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 33060 + }, + { + "epoch": 0.33061, + "grad_norm": 0.8370702922409893, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 33061 + }, + { + "epoch": 0.33062, + "grad_norm": 0.8959150710592624, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 33062 + }, + { + "epoch": 0.33063, + "grad_norm": 0.9958039848835833, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 33063 + }, + { + "epoch": 0.33064, + "grad_norm": 1.119185680312841, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 33064 + }, + { + "epoch": 0.33065, + "grad_norm": 0.8006562907848775, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 33065 + }, + { + "epoch": 0.33066, + "grad_norm": 0.6456812088251573, + "learning_rate": 0.003, + "loss": 4.0043, + "step": 33066 + }, + { + "epoch": 0.33067, + "grad_norm": 0.5823895886695795, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 33067 + }, + { + "epoch": 0.33068, + "grad_norm": 0.5718903017268412, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 33068 + }, + { + "epoch": 0.33069, + "grad_norm": 0.6596837769411772, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 33069 + }, + { + "epoch": 0.3307, + "grad_norm": 0.655389635099349, + "learning_rate": 0.003, + "loss": 4.0059, + "step": 33070 + }, + { + "epoch": 0.33071, + "grad_norm": 0.703676628435894, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 33071 + }, + { + "epoch": 0.33072, + "grad_norm": 0.8001275423349887, + "learning_rate": 0.003, + "loss": 4.072, + "step": 33072 + }, + { + "epoch": 0.33073, + "grad_norm": 0.855745444626138, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 33073 + }, + { + "epoch": 0.33074, + "grad_norm": 0.8645823316164157, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 33074 + }, + { + "epoch": 0.33075, + "grad_norm": 0.8974268695616503, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 33075 + }, + { + "epoch": 0.33076, + "grad_norm": 0.9130947547224789, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 33076 + }, + { + "epoch": 0.33077, + "grad_norm": 0.9699312695158779, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 33077 + }, + { + "epoch": 0.33078, + "grad_norm": 1.0361845788058084, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 33078 + }, + { + "epoch": 0.33079, + "grad_norm": 0.769096018504092, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 33079 + }, + { + "epoch": 0.3308, + "grad_norm": 0.7785444935317384, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 33080 + }, + { + "epoch": 0.33081, + "grad_norm": 0.8867992983069425, + "learning_rate": 0.003, + "loss": 4.025, + "step": 33081 + }, + { + "epoch": 0.33082, + "grad_norm": 1.0243221566467258, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 33082 + }, + { + "epoch": 0.33083, + "grad_norm": 1.2922101771469248, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 33083 + }, + { + "epoch": 0.33084, + "grad_norm": 0.8145206756566985, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 33084 + }, + { + "epoch": 0.33085, + "grad_norm": 0.6909658975921286, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 33085 + }, + { + "epoch": 0.33086, + "grad_norm": 0.6193606999604935, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 33086 + }, + { + "epoch": 0.33087, + "grad_norm": 0.5335183408392776, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 33087 + }, + { + "epoch": 0.33088, + "grad_norm": 0.5865069493072997, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 33088 + }, + { + "epoch": 0.33089, + "grad_norm": 0.5060344461536281, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 33089 + }, + { + "epoch": 0.3309, + "grad_norm": 0.5807368883608437, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 33090 + }, + { + "epoch": 0.33091, + "grad_norm": 0.6896235885646109, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 33091 + }, + { + "epoch": 0.33092, + "grad_norm": 0.9329965005808539, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 33092 + }, + { + "epoch": 0.33093, + "grad_norm": 1.154156465236188, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 33093 + }, + { + "epoch": 0.33094, + "grad_norm": 0.866038632102812, + "learning_rate": 0.003, + "loss": 4.049, + "step": 33094 + }, + { + "epoch": 0.33095, + "grad_norm": 0.785540574737168, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 33095 + }, + { + "epoch": 0.33096, + "grad_norm": 0.6803720498531979, + "learning_rate": 0.003, + "loss": 4.0037, + "step": 33096 + }, + { + "epoch": 0.33097, + "grad_norm": 0.7272363362049685, + "learning_rate": 0.003, + "loss": 4.049, + "step": 33097 + }, + { + "epoch": 0.33098, + "grad_norm": 0.9686816138095062, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 33098 + }, + { + "epoch": 0.33099, + "grad_norm": 1.3058478638130415, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 33099 + }, + { + "epoch": 0.331, + "grad_norm": 0.771521968074124, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 33100 + }, + { + "epoch": 0.33101, + "grad_norm": 0.8544438415217132, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 33101 + }, + { + "epoch": 0.33102, + "grad_norm": 0.952976672153501, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 33102 + }, + { + "epoch": 0.33103, + "grad_norm": 0.9871427216098628, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 33103 + }, + { + "epoch": 0.33104, + "grad_norm": 0.9801996292997968, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 33104 + }, + { + "epoch": 0.33105, + "grad_norm": 0.9708329845872995, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 33105 + }, + { + "epoch": 0.33106, + "grad_norm": 0.821149343524782, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 33106 + }, + { + "epoch": 0.33107, + "grad_norm": 0.7082866938168073, + "learning_rate": 0.003, + "loss": 3.9984, + "step": 33107 + }, + { + "epoch": 0.33108, + "grad_norm": 0.6877610431743001, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 33108 + }, + { + "epoch": 0.33109, + "grad_norm": 0.7250056190505494, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 33109 + }, + { + "epoch": 0.3311, + "grad_norm": 0.7617119381085308, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 33110 + }, + { + "epoch": 0.33111, + "grad_norm": 0.8147962746434236, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 33111 + }, + { + "epoch": 0.33112, + "grad_norm": 0.9785031790878241, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 33112 + }, + { + "epoch": 0.33113, + "grad_norm": 1.1068639573230046, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 33113 + }, + { + "epoch": 0.33114, + "grad_norm": 1.2158786828016956, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 33114 + }, + { + "epoch": 0.33115, + "grad_norm": 1.0110611777529348, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 33115 + }, + { + "epoch": 0.33116, + "grad_norm": 0.9725810963220475, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 33116 + }, + { + "epoch": 0.33117, + "grad_norm": 0.9228978651730604, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 33117 + }, + { + "epoch": 0.33118, + "grad_norm": 1.010121431585825, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 33118 + }, + { + "epoch": 0.33119, + "grad_norm": 0.9969489418222564, + "learning_rate": 0.003, + "loss": 4.042, + "step": 33119 + }, + { + "epoch": 0.3312, + "grad_norm": 0.9018544564375202, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 33120 + }, + { + "epoch": 0.33121, + "grad_norm": 0.7792230044935711, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 33121 + }, + { + "epoch": 0.33122, + "grad_norm": 0.841664313337199, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 33122 + }, + { + "epoch": 0.33123, + "grad_norm": 0.808296212257901, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 33123 + }, + { + "epoch": 0.33124, + "grad_norm": 0.9064389374291256, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 33124 + }, + { + "epoch": 0.33125, + "grad_norm": 0.973353388930588, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 33125 + }, + { + "epoch": 0.33126, + "grad_norm": 1.1501809353623658, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 33126 + }, + { + "epoch": 0.33127, + "grad_norm": 0.775657586989361, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 33127 + }, + { + "epoch": 0.33128, + "grad_norm": 0.6806491392147505, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 33128 + }, + { + "epoch": 0.33129, + "grad_norm": 0.6717898991498761, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 33129 + }, + { + "epoch": 0.3313, + "grad_norm": 0.6871434820552076, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 33130 + }, + { + "epoch": 0.33131, + "grad_norm": 0.6231609134500362, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 33131 + }, + { + "epoch": 0.33132, + "grad_norm": 0.6233974829398137, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 33132 + }, + { + "epoch": 0.33133, + "grad_norm": 0.6557001004377007, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 33133 + }, + { + "epoch": 0.33134, + "grad_norm": 0.685775476261344, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 33134 + }, + { + "epoch": 0.33135, + "grad_norm": 0.6178724321201882, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 33135 + }, + { + "epoch": 0.33136, + "grad_norm": 0.7159426956464867, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 33136 + }, + { + "epoch": 0.33137, + "grad_norm": 0.7179011813900265, + "learning_rate": 0.003, + "loss": 4.044, + "step": 33137 + }, + { + "epoch": 0.33138, + "grad_norm": 0.5801292386739222, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 33138 + }, + { + "epoch": 0.33139, + "grad_norm": 0.6105317048676393, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 33139 + }, + { + "epoch": 0.3314, + "grad_norm": 0.5311100333000499, + "learning_rate": 0.003, + "loss": 4.0014, + "step": 33140 + }, + { + "epoch": 0.33141, + "grad_norm": 0.5039277320792148, + "learning_rate": 0.003, + "loss": 3.9993, + "step": 33141 + }, + { + "epoch": 0.33142, + "grad_norm": 0.6694178679194572, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 33142 + }, + { + "epoch": 0.33143, + "grad_norm": 0.9795044168635224, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 33143 + }, + { + "epoch": 0.33144, + "grad_norm": 1.3677241467029255, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 33144 + }, + { + "epoch": 0.33145, + "grad_norm": 0.7125268383919716, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 33145 + }, + { + "epoch": 0.33146, + "grad_norm": 0.6358488337253375, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 33146 + }, + { + "epoch": 0.33147, + "grad_norm": 0.6158784593054026, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 33147 + }, + { + "epoch": 0.33148, + "grad_norm": 0.6391827628613184, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 33148 + }, + { + "epoch": 0.33149, + "grad_norm": 0.6346361012793894, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 33149 + }, + { + "epoch": 0.3315, + "grad_norm": 0.6703251503146196, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 33150 + }, + { + "epoch": 0.33151, + "grad_norm": 0.794634277932375, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 33151 + }, + { + "epoch": 0.33152, + "grad_norm": 0.9779880063553521, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 33152 + }, + { + "epoch": 0.33153, + "grad_norm": 1.1408041544016274, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 33153 + }, + { + "epoch": 0.33154, + "grad_norm": 1.0407384773485815, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 33154 + }, + { + "epoch": 0.33155, + "grad_norm": 1.05828276740499, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 33155 + }, + { + "epoch": 0.33156, + "grad_norm": 1.1083851749660065, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 33156 + }, + { + "epoch": 0.33157, + "grad_norm": 1.1751394921204286, + "learning_rate": 0.003, + "loss": 4.063, + "step": 33157 + }, + { + "epoch": 0.33158, + "grad_norm": 0.978994786091527, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 33158 + }, + { + "epoch": 0.33159, + "grad_norm": 1.0148970496697896, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 33159 + }, + { + "epoch": 0.3316, + "grad_norm": 1.143837045021433, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 33160 + }, + { + "epoch": 0.33161, + "grad_norm": 0.9839010544144808, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 33161 + }, + { + "epoch": 0.33162, + "grad_norm": 0.8699860386808788, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 33162 + }, + { + "epoch": 0.33163, + "grad_norm": 0.9067678402218485, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 33163 + }, + { + "epoch": 0.33164, + "grad_norm": 0.8762856068963026, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 33164 + }, + { + "epoch": 0.33165, + "grad_norm": 0.9139473516750833, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 33165 + }, + { + "epoch": 0.33166, + "grad_norm": 1.0157947061219308, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 33166 + }, + { + "epoch": 0.33167, + "grad_norm": 1.0236508851358188, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 33167 + }, + { + "epoch": 0.33168, + "grad_norm": 0.913101622726206, + "learning_rate": 0.003, + "loss": 4.039, + "step": 33168 + }, + { + "epoch": 0.33169, + "grad_norm": 0.8638677351200966, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 33169 + }, + { + "epoch": 0.3317, + "grad_norm": 0.8149921862438072, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 33170 + }, + { + "epoch": 0.33171, + "grad_norm": 0.8489259110360203, + "learning_rate": 0.003, + "loss": 4.071, + "step": 33171 + }, + { + "epoch": 0.33172, + "grad_norm": 0.8860259291597015, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 33172 + }, + { + "epoch": 0.33173, + "grad_norm": 0.7694409305878398, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 33173 + }, + { + "epoch": 0.33174, + "grad_norm": 0.7055082842834273, + "learning_rate": 0.003, + "loss": 4.0006, + "step": 33174 + }, + { + "epoch": 0.33175, + "grad_norm": 0.8786300605427857, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 33175 + }, + { + "epoch": 0.33176, + "grad_norm": 1.067912465386333, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 33176 + }, + { + "epoch": 0.33177, + "grad_norm": 1.1822002781175989, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 33177 + }, + { + "epoch": 0.33178, + "grad_norm": 0.8500687705564448, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 33178 + }, + { + "epoch": 0.33179, + "grad_norm": 0.6073383634073098, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 33179 + }, + { + "epoch": 0.3318, + "grad_norm": 0.556474211126253, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 33180 + }, + { + "epoch": 0.33181, + "grad_norm": 0.5157811920398366, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 33181 + }, + { + "epoch": 0.33182, + "grad_norm": 0.5392580123634528, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 33182 + }, + { + "epoch": 0.33183, + "grad_norm": 0.5203498903468842, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 33183 + }, + { + "epoch": 0.33184, + "grad_norm": 0.5033203681131596, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 33184 + }, + { + "epoch": 0.33185, + "grad_norm": 0.5151286232331618, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 33185 + }, + { + "epoch": 0.33186, + "grad_norm": 0.5526939922818893, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 33186 + }, + { + "epoch": 0.33187, + "grad_norm": 0.5906380280286542, + "learning_rate": 0.003, + "loss": 4.0022, + "step": 33187 + }, + { + "epoch": 0.33188, + "grad_norm": 0.5922046905994872, + "learning_rate": 0.003, + "loss": 4.019, + "step": 33188 + }, + { + "epoch": 0.33189, + "grad_norm": 0.5981129760485763, + "learning_rate": 0.003, + "loss": 4.02, + "step": 33189 + }, + { + "epoch": 0.3319, + "grad_norm": 0.7133378837268015, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 33190 + }, + { + "epoch": 0.33191, + "grad_norm": 1.003276936433062, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 33191 + }, + { + "epoch": 0.33192, + "grad_norm": 1.2206001492506104, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 33192 + }, + { + "epoch": 0.33193, + "grad_norm": 0.8462541503113888, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 33193 + }, + { + "epoch": 0.33194, + "grad_norm": 0.852221025693233, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 33194 + }, + { + "epoch": 0.33195, + "grad_norm": 0.7874589015617214, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 33195 + }, + { + "epoch": 0.33196, + "grad_norm": 0.8011800875675269, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 33196 + }, + { + "epoch": 0.33197, + "grad_norm": 0.8434308582523594, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 33197 + }, + { + "epoch": 0.33198, + "grad_norm": 0.803477282363193, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 33198 + }, + { + "epoch": 0.33199, + "grad_norm": 0.9037528117123197, + "learning_rate": 0.003, + "loss": 3.9853, + "step": 33199 + }, + { + "epoch": 0.332, + "grad_norm": 1.1962474299828751, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 33200 + }, + { + "epoch": 0.33201, + "grad_norm": 1.0264870204204917, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 33201 + }, + { + "epoch": 0.33202, + "grad_norm": 0.9054558366136043, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 33202 + }, + { + "epoch": 0.33203, + "grad_norm": 0.830156389591633, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 33203 + }, + { + "epoch": 0.33204, + "grad_norm": 0.7647726542188915, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 33204 + }, + { + "epoch": 0.33205, + "grad_norm": 0.8167655059898276, + "learning_rate": 0.003, + "loss": 4.042, + "step": 33205 + }, + { + "epoch": 0.33206, + "grad_norm": 0.8796589046459141, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 33206 + }, + { + "epoch": 0.33207, + "grad_norm": 0.8290609837601434, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 33207 + }, + { + "epoch": 0.33208, + "grad_norm": 0.9685632306696346, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 33208 + }, + { + "epoch": 0.33209, + "grad_norm": 0.9621543561253598, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 33209 + }, + { + "epoch": 0.3321, + "grad_norm": 0.9415134862184922, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 33210 + }, + { + "epoch": 0.33211, + "grad_norm": 0.8546166668341312, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 33211 + }, + { + "epoch": 0.33212, + "grad_norm": 0.899318427654502, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 33212 + }, + { + "epoch": 0.33213, + "grad_norm": 0.9486448304796464, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 33213 + }, + { + "epoch": 0.33214, + "grad_norm": 1.0257705673695832, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 33214 + }, + { + "epoch": 0.33215, + "grad_norm": 0.9166575066269033, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 33215 + }, + { + "epoch": 0.33216, + "grad_norm": 0.8692365918909141, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 33216 + }, + { + "epoch": 0.33217, + "grad_norm": 0.8684166946030597, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 33217 + }, + { + "epoch": 0.33218, + "grad_norm": 0.9223398422329775, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 33218 + }, + { + "epoch": 0.33219, + "grad_norm": 0.7889970908158304, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 33219 + }, + { + "epoch": 0.3322, + "grad_norm": 0.6461976327828892, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 33220 + }, + { + "epoch": 0.33221, + "grad_norm": 0.7413033729665723, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 33221 + }, + { + "epoch": 0.33222, + "grad_norm": 0.7728775271150758, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 33222 + }, + { + "epoch": 0.33223, + "grad_norm": 0.7915593898506665, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 33223 + }, + { + "epoch": 0.33224, + "grad_norm": 0.9002962633302, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 33224 + }, + { + "epoch": 0.33225, + "grad_norm": 1.0952031138890588, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 33225 + }, + { + "epoch": 0.33226, + "grad_norm": 1.1001129997323829, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 33226 + }, + { + "epoch": 0.33227, + "grad_norm": 0.9245671908895527, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 33227 + }, + { + "epoch": 0.33228, + "grad_norm": 0.8507626729736291, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 33228 + }, + { + "epoch": 0.33229, + "grad_norm": 0.746898912404189, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 33229 + }, + { + "epoch": 0.3323, + "grad_norm": 0.7182759805047383, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 33230 + }, + { + "epoch": 0.33231, + "grad_norm": 0.7082838067710381, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 33231 + }, + { + "epoch": 0.33232, + "grad_norm": 0.7044310895173217, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 33232 + }, + { + "epoch": 0.33233, + "grad_norm": 0.6600811207065008, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 33233 + }, + { + "epoch": 0.33234, + "grad_norm": 0.8632965392303131, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 33234 + }, + { + "epoch": 0.33235, + "grad_norm": 1.0105144234583896, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 33235 + }, + { + "epoch": 0.33236, + "grad_norm": 1.0442538420908853, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 33236 + }, + { + "epoch": 0.33237, + "grad_norm": 1.1419494112937287, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 33237 + }, + { + "epoch": 0.33238, + "grad_norm": 0.8746545562793311, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 33238 + }, + { + "epoch": 0.33239, + "grad_norm": 0.6912659951647822, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 33239 + }, + { + "epoch": 0.3324, + "grad_norm": 0.7139630083498203, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 33240 + }, + { + "epoch": 0.33241, + "grad_norm": 0.7064651915937736, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 33241 + }, + { + "epoch": 0.33242, + "grad_norm": 0.6440441697946089, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 33242 + }, + { + "epoch": 0.33243, + "grad_norm": 0.7330661626697833, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 33243 + }, + { + "epoch": 0.33244, + "grad_norm": 0.9474671528011014, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 33244 + }, + { + "epoch": 0.33245, + "grad_norm": 1.0071675675059995, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 33245 + }, + { + "epoch": 0.33246, + "grad_norm": 1.007942697562821, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 33246 + }, + { + "epoch": 0.33247, + "grad_norm": 0.9313910465991228, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 33247 + }, + { + "epoch": 0.33248, + "grad_norm": 0.8995941426652255, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 33248 + }, + { + "epoch": 0.33249, + "grad_norm": 0.697466525200447, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 33249 + }, + { + "epoch": 0.3325, + "grad_norm": 0.6837103290497701, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 33250 + }, + { + "epoch": 0.33251, + "grad_norm": 0.7751673571675906, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 33251 + }, + { + "epoch": 0.33252, + "grad_norm": 0.8000229901195006, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 33252 + }, + { + "epoch": 0.33253, + "grad_norm": 0.8093074999819367, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 33253 + }, + { + "epoch": 0.33254, + "grad_norm": 0.9550229714314603, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 33254 + }, + { + "epoch": 0.33255, + "grad_norm": 1.2028363638500297, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 33255 + }, + { + "epoch": 0.33256, + "grad_norm": 0.9043159725479395, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 33256 + }, + { + "epoch": 0.33257, + "grad_norm": 0.7768335505233628, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 33257 + }, + { + "epoch": 0.33258, + "grad_norm": 0.8372921255509228, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 33258 + }, + { + "epoch": 0.33259, + "grad_norm": 0.7534945536167506, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 33259 + }, + { + "epoch": 0.3326, + "grad_norm": 0.6585923088851325, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 33260 + }, + { + "epoch": 0.33261, + "grad_norm": 0.7914267420327626, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 33261 + }, + { + "epoch": 0.33262, + "grad_norm": 0.7434969548860024, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 33262 + }, + { + "epoch": 0.33263, + "grad_norm": 0.804755511238947, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 33263 + }, + { + "epoch": 0.33264, + "grad_norm": 0.765303934071197, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 33264 + }, + { + "epoch": 0.33265, + "grad_norm": 0.7241109521896134, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 33265 + }, + { + "epoch": 0.33266, + "grad_norm": 0.7321214805645182, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 33266 + }, + { + "epoch": 0.33267, + "grad_norm": 0.8877356925801678, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 33267 + }, + { + "epoch": 0.33268, + "grad_norm": 0.9061846128008729, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 33268 + }, + { + "epoch": 0.33269, + "grad_norm": 0.8622369113085281, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 33269 + }, + { + "epoch": 0.3327, + "grad_norm": 0.9063633656509638, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 33270 + }, + { + "epoch": 0.33271, + "grad_norm": 0.8999371861705816, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 33271 + }, + { + "epoch": 0.33272, + "grad_norm": 1.228776114901829, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 33272 + }, + { + "epoch": 0.33273, + "grad_norm": 0.9363687513479486, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 33273 + }, + { + "epoch": 0.33274, + "grad_norm": 0.8655539261671985, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 33274 + }, + { + "epoch": 0.33275, + "grad_norm": 0.9536726369018483, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 33275 + }, + { + "epoch": 0.33276, + "grad_norm": 0.9990851726069606, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 33276 + }, + { + "epoch": 0.33277, + "grad_norm": 0.8819330944187551, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 33277 + }, + { + "epoch": 0.33278, + "grad_norm": 0.8471979831515771, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 33278 + }, + { + "epoch": 0.33279, + "grad_norm": 0.72759799486653, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 33279 + }, + { + "epoch": 0.3328, + "grad_norm": 0.6667116586478672, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 33280 + }, + { + "epoch": 0.33281, + "grad_norm": 0.6547937082267524, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 33281 + }, + { + "epoch": 0.33282, + "grad_norm": 0.7905092865454293, + "learning_rate": 0.003, + "loss": 4.033, + "step": 33282 + }, + { + "epoch": 0.33283, + "grad_norm": 0.9274193839187903, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 33283 + }, + { + "epoch": 0.33284, + "grad_norm": 1.227190317388163, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 33284 + }, + { + "epoch": 0.33285, + "grad_norm": 0.8329195956922195, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 33285 + }, + { + "epoch": 0.33286, + "grad_norm": 0.7998024712539566, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 33286 + }, + { + "epoch": 0.33287, + "grad_norm": 0.9018474629170474, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 33287 + }, + { + "epoch": 0.33288, + "grad_norm": 0.958960561810725, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 33288 + }, + { + "epoch": 0.33289, + "grad_norm": 0.9298418415462505, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 33289 + }, + { + "epoch": 0.3329, + "grad_norm": 0.9298921624866663, + "learning_rate": 0.003, + "loss": 4.03, + "step": 33290 + }, + { + "epoch": 0.33291, + "grad_norm": 0.9843132733638666, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 33291 + }, + { + "epoch": 0.33292, + "grad_norm": 0.9477196777456502, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 33292 + }, + { + "epoch": 0.33293, + "grad_norm": 0.88624817612436, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 33293 + }, + { + "epoch": 0.33294, + "grad_norm": 0.8629305348558095, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 33294 + }, + { + "epoch": 0.33295, + "grad_norm": 0.9465481358434882, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 33295 + }, + { + "epoch": 0.33296, + "grad_norm": 1.0069659006708835, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 33296 + }, + { + "epoch": 0.33297, + "grad_norm": 1.0616510217805197, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 33297 + }, + { + "epoch": 0.33298, + "grad_norm": 0.7555026255764139, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 33298 + }, + { + "epoch": 0.33299, + "grad_norm": 0.7060965566417634, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 33299 + }, + { + "epoch": 0.333, + "grad_norm": 0.7717905322659935, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 33300 + }, + { + "epoch": 0.33301, + "grad_norm": 0.8272922601497754, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 33301 + }, + { + "epoch": 0.33302, + "grad_norm": 0.8136546518762648, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 33302 + }, + { + "epoch": 0.33303, + "grad_norm": 0.7286741488636451, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 33303 + }, + { + "epoch": 0.33304, + "grad_norm": 0.819613187146706, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 33304 + }, + { + "epoch": 0.33305, + "grad_norm": 0.9303796765665868, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 33305 + }, + { + "epoch": 0.33306, + "grad_norm": 1.0074278004664337, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 33306 + }, + { + "epoch": 0.33307, + "grad_norm": 1.0137661350047185, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 33307 + }, + { + "epoch": 0.33308, + "grad_norm": 0.9323007523893484, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 33308 + }, + { + "epoch": 0.33309, + "grad_norm": 0.9286932282686795, + "learning_rate": 0.003, + "loss": 4.065, + "step": 33309 + }, + { + "epoch": 0.3331, + "grad_norm": 0.770354753097008, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 33310 + }, + { + "epoch": 0.33311, + "grad_norm": 0.7382329705374477, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 33311 + }, + { + "epoch": 0.33312, + "grad_norm": 0.6238322085646718, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 33312 + }, + { + "epoch": 0.33313, + "grad_norm": 0.5526982004910219, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 33313 + }, + { + "epoch": 0.33314, + "grad_norm": 0.5792018822135384, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 33314 + }, + { + "epoch": 0.33315, + "grad_norm": 0.7205738865495327, + "learning_rate": 0.003, + "loss": 4.0087, + "step": 33315 + }, + { + "epoch": 0.33316, + "grad_norm": 0.9604716331512959, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 33316 + }, + { + "epoch": 0.33317, + "grad_norm": 1.3073509809583759, + "learning_rate": 0.003, + "loss": 4.016, + "step": 33317 + }, + { + "epoch": 0.33318, + "grad_norm": 0.58850028006163, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 33318 + }, + { + "epoch": 0.33319, + "grad_norm": 0.7160154606202531, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 33319 + }, + { + "epoch": 0.3332, + "grad_norm": 1.048946026528417, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 33320 + }, + { + "epoch": 0.33321, + "grad_norm": 0.9174982722954346, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 33321 + }, + { + "epoch": 0.33322, + "grad_norm": 0.7553982053411437, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 33322 + }, + { + "epoch": 0.33323, + "grad_norm": 0.7150546012251735, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 33323 + }, + { + "epoch": 0.33324, + "grad_norm": 0.7102766500107569, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 33324 + }, + { + "epoch": 0.33325, + "grad_norm": 0.7245684909474662, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 33325 + }, + { + "epoch": 0.33326, + "grad_norm": 0.7490610560449205, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 33326 + }, + { + "epoch": 0.33327, + "grad_norm": 0.7738135572717182, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 33327 + }, + { + "epoch": 0.33328, + "grad_norm": 0.7514528703458535, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 33328 + }, + { + "epoch": 0.33329, + "grad_norm": 0.677799317624192, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 33329 + }, + { + "epoch": 0.3333, + "grad_norm": 0.7186352508241046, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 33330 + }, + { + "epoch": 0.33331, + "grad_norm": 0.6853635653553354, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 33331 + }, + { + "epoch": 0.33332, + "grad_norm": 0.7718452823161797, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 33332 + }, + { + "epoch": 0.33333, + "grad_norm": 0.8624313771085232, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 33333 + }, + { + "epoch": 0.33334, + "grad_norm": 0.9720975979391973, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 33334 + }, + { + "epoch": 0.33335, + "grad_norm": 0.9846330486783315, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 33335 + }, + { + "epoch": 0.33336, + "grad_norm": 1.0076436789217027, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 33336 + }, + { + "epoch": 0.33337, + "grad_norm": 1.0401266000765022, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 33337 + }, + { + "epoch": 0.33338, + "grad_norm": 1.003350629312075, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 33338 + }, + { + "epoch": 0.33339, + "grad_norm": 0.9959680916579017, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 33339 + }, + { + "epoch": 0.3334, + "grad_norm": 0.9917617045813754, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 33340 + }, + { + "epoch": 0.33341, + "grad_norm": 0.8043459385303889, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 33341 + }, + { + "epoch": 0.33342, + "grad_norm": 0.8033768050217123, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 33342 + }, + { + "epoch": 0.33343, + "grad_norm": 0.7882052647208783, + "learning_rate": 0.003, + "loss": 4.065, + "step": 33343 + }, + { + "epoch": 0.33344, + "grad_norm": 0.900428299723834, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 33344 + }, + { + "epoch": 0.33345, + "grad_norm": 1.0307832153563465, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 33345 + }, + { + "epoch": 0.33346, + "grad_norm": 0.996884809073134, + "learning_rate": 0.003, + "loss": 4.034, + "step": 33346 + }, + { + "epoch": 0.33347, + "grad_norm": 0.9950379967464209, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 33347 + }, + { + "epoch": 0.33348, + "grad_norm": 0.9314530625794599, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 33348 + }, + { + "epoch": 0.33349, + "grad_norm": 0.9084031525776701, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 33349 + }, + { + "epoch": 0.3335, + "grad_norm": 0.8986614036959937, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 33350 + }, + { + "epoch": 0.33351, + "grad_norm": 0.8849763114539304, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 33351 + }, + { + "epoch": 0.33352, + "grad_norm": 0.7889523547667419, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 33352 + }, + { + "epoch": 0.33353, + "grad_norm": 0.8210755439825939, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 33353 + }, + { + "epoch": 0.33354, + "grad_norm": 0.7900765764962026, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 33354 + }, + { + "epoch": 0.33355, + "grad_norm": 0.81944029430372, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 33355 + }, + { + "epoch": 0.33356, + "grad_norm": 0.9833580940704708, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 33356 + }, + { + "epoch": 0.33357, + "grad_norm": 1.106402193024358, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 33357 + }, + { + "epoch": 0.33358, + "grad_norm": 0.9229854824922316, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 33358 + }, + { + "epoch": 0.33359, + "grad_norm": 0.8627374269215073, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 33359 + }, + { + "epoch": 0.3336, + "grad_norm": 0.9058011091454641, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 33360 + }, + { + "epoch": 0.33361, + "grad_norm": 0.8426100277198227, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 33361 + }, + { + "epoch": 0.33362, + "grad_norm": 0.7306558176093666, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 33362 + }, + { + "epoch": 0.33363, + "grad_norm": 0.5835573813882036, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 33363 + }, + { + "epoch": 0.33364, + "grad_norm": 0.6175814669265302, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 33364 + }, + { + "epoch": 0.33365, + "grad_norm": 0.6710901286711127, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 33365 + }, + { + "epoch": 0.33366, + "grad_norm": 0.7269772762767898, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 33366 + }, + { + "epoch": 0.33367, + "grad_norm": 0.785864206367794, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 33367 + }, + { + "epoch": 0.33368, + "grad_norm": 0.8822439158355975, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 33368 + }, + { + "epoch": 0.33369, + "grad_norm": 0.9537205744201658, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 33369 + }, + { + "epoch": 0.3337, + "grad_norm": 0.9235509431199971, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 33370 + }, + { + "epoch": 0.33371, + "grad_norm": 0.7092882713777754, + "learning_rate": 0.003, + "loss": 4.044, + "step": 33371 + }, + { + "epoch": 0.33372, + "grad_norm": 0.7136674515729304, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 33372 + }, + { + "epoch": 0.33373, + "grad_norm": 0.692342493707749, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 33373 + }, + { + "epoch": 0.33374, + "grad_norm": 0.6805295975417623, + "learning_rate": 0.003, + "loss": 3.9976, + "step": 33374 + }, + { + "epoch": 0.33375, + "grad_norm": 0.7509553086876447, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 33375 + }, + { + "epoch": 0.33376, + "grad_norm": 0.8650374080244768, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 33376 + }, + { + "epoch": 0.33377, + "grad_norm": 0.9550921190549287, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 33377 + }, + { + "epoch": 0.33378, + "grad_norm": 0.9708827917983992, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 33378 + }, + { + "epoch": 0.33379, + "grad_norm": 0.9142487141191915, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 33379 + }, + { + "epoch": 0.3338, + "grad_norm": 0.9202660257083384, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 33380 + }, + { + "epoch": 0.33381, + "grad_norm": 0.7874397603158287, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 33381 + }, + { + "epoch": 0.33382, + "grad_norm": 0.6403194995470689, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 33382 + }, + { + "epoch": 0.33383, + "grad_norm": 0.6612253456899041, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 33383 + }, + { + "epoch": 0.33384, + "grad_norm": 0.834622769228584, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 33384 + }, + { + "epoch": 0.33385, + "grad_norm": 0.9384407847816049, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 33385 + }, + { + "epoch": 0.33386, + "grad_norm": 1.0372367610519904, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 33386 + }, + { + "epoch": 0.33387, + "grad_norm": 1.032922372300866, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 33387 + }, + { + "epoch": 0.33388, + "grad_norm": 0.8444379457647426, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 33388 + }, + { + "epoch": 0.33389, + "grad_norm": 0.6976530969223596, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 33389 + }, + { + "epoch": 0.3339, + "grad_norm": 0.6399301773457566, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 33390 + }, + { + "epoch": 0.33391, + "grad_norm": 0.6136997134599194, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 33391 + }, + { + "epoch": 0.33392, + "grad_norm": 0.6241371902268488, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 33392 + }, + { + "epoch": 0.33393, + "grad_norm": 0.6868410692337971, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 33393 + }, + { + "epoch": 0.33394, + "grad_norm": 0.7330244499953732, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 33394 + }, + { + "epoch": 0.33395, + "grad_norm": 0.8186830934409803, + "learning_rate": 0.003, + "loss": 4.046, + "step": 33395 + }, + { + "epoch": 0.33396, + "grad_norm": 1.0114948544369764, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 33396 + }, + { + "epoch": 0.33397, + "grad_norm": 1.085815198855781, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 33397 + }, + { + "epoch": 0.33398, + "grad_norm": 0.9430671854639365, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 33398 + }, + { + "epoch": 0.33399, + "grad_norm": 0.9505432899801365, + "learning_rate": 0.003, + "loss": 4.048, + "step": 33399 + }, + { + "epoch": 0.334, + "grad_norm": 0.898644859300184, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 33400 + }, + { + "epoch": 0.33401, + "grad_norm": 0.8016723185043879, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 33401 + }, + { + "epoch": 0.33402, + "grad_norm": 0.8178678837158063, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 33402 + }, + { + "epoch": 0.33403, + "grad_norm": 0.8171846780000046, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 33403 + }, + { + "epoch": 0.33404, + "grad_norm": 0.7637504805082586, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 33404 + }, + { + "epoch": 0.33405, + "grad_norm": 0.78490916268894, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 33405 + }, + { + "epoch": 0.33406, + "grad_norm": 0.7737647126991103, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 33406 + }, + { + "epoch": 0.33407, + "grad_norm": 0.830597861707749, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 33407 + }, + { + "epoch": 0.33408, + "grad_norm": 0.9068920520964764, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 33408 + }, + { + "epoch": 0.33409, + "grad_norm": 0.8898916527700895, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 33409 + }, + { + "epoch": 0.3341, + "grad_norm": 0.9533014795270639, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 33410 + }, + { + "epoch": 0.33411, + "grad_norm": 1.0183902931181759, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 33411 + }, + { + "epoch": 0.33412, + "grad_norm": 1.0695337901114388, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 33412 + }, + { + "epoch": 0.33413, + "grad_norm": 0.9424585803980258, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 33413 + }, + { + "epoch": 0.33414, + "grad_norm": 0.7926427786197242, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 33414 + }, + { + "epoch": 0.33415, + "grad_norm": 0.7841684851632081, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 33415 + }, + { + "epoch": 0.33416, + "grad_norm": 0.7932362454450294, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 33416 + }, + { + "epoch": 0.33417, + "grad_norm": 0.8505726323249602, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 33417 + }, + { + "epoch": 0.33418, + "grad_norm": 0.8868499442190434, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 33418 + }, + { + "epoch": 0.33419, + "grad_norm": 0.8539995990251088, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 33419 + }, + { + "epoch": 0.3342, + "grad_norm": 0.808349081846885, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 33420 + }, + { + "epoch": 0.33421, + "grad_norm": 0.6488981615711122, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 33421 + }, + { + "epoch": 0.33422, + "grad_norm": 0.5750322265093099, + "learning_rate": 0.003, + "loss": 4.034, + "step": 33422 + }, + { + "epoch": 0.33423, + "grad_norm": 0.5238950400625405, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 33423 + }, + { + "epoch": 0.33424, + "grad_norm": 0.5881696124622532, + "learning_rate": 0.003, + "loss": 4.043, + "step": 33424 + }, + { + "epoch": 0.33425, + "grad_norm": 0.5850745819719491, + "learning_rate": 0.003, + "loss": 4.054, + "step": 33425 + }, + { + "epoch": 0.33426, + "grad_norm": 0.654993031226601, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 33426 + }, + { + "epoch": 0.33427, + "grad_norm": 0.710482025132162, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 33427 + }, + { + "epoch": 0.33428, + "grad_norm": 0.7962968582896989, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 33428 + }, + { + "epoch": 0.33429, + "grad_norm": 1.0169009739627957, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 33429 + }, + { + "epoch": 0.3343, + "grad_norm": 1.166233969724563, + "learning_rate": 0.003, + "loss": 4.064, + "step": 33430 + }, + { + "epoch": 0.33431, + "grad_norm": 0.8417757833979131, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 33431 + }, + { + "epoch": 0.33432, + "grad_norm": 0.8384339708235556, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 33432 + }, + { + "epoch": 0.33433, + "grad_norm": 0.8459915354674105, + "learning_rate": 0.003, + "loss": 4.0066, + "step": 33433 + }, + { + "epoch": 0.33434, + "grad_norm": 0.9423443365450178, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 33434 + }, + { + "epoch": 0.33435, + "grad_norm": 0.869912428363126, + "learning_rate": 0.003, + "loss": 4.0069, + "step": 33435 + }, + { + "epoch": 0.33436, + "grad_norm": 0.9415144169549705, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 33436 + }, + { + "epoch": 0.33437, + "grad_norm": 1.0150367239400437, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 33437 + }, + { + "epoch": 0.33438, + "grad_norm": 1.0066312648284297, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 33438 + }, + { + "epoch": 0.33439, + "grad_norm": 1.1211641582453507, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 33439 + }, + { + "epoch": 0.3344, + "grad_norm": 0.9376414844849542, + "learning_rate": 0.003, + "loss": 4.062, + "step": 33440 + }, + { + "epoch": 0.33441, + "grad_norm": 1.0827740752440962, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 33441 + }, + { + "epoch": 0.33442, + "grad_norm": 1.079851839398803, + "learning_rate": 0.003, + "loss": 4.076, + "step": 33442 + }, + { + "epoch": 0.33443, + "grad_norm": 0.8658830522646865, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 33443 + }, + { + "epoch": 0.33444, + "grad_norm": 0.8426996690883429, + "learning_rate": 0.003, + "loss": 4.068, + "step": 33444 + }, + { + "epoch": 0.33445, + "grad_norm": 0.8143941663425606, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 33445 + }, + { + "epoch": 0.33446, + "grad_norm": 0.8660839115511949, + "learning_rate": 0.003, + "loss": 4.037, + "step": 33446 + }, + { + "epoch": 0.33447, + "grad_norm": 0.9553266242749251, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 33447 + }, + { + "epoch": 0.33448, + "grad_norm": 1.2729477117881887, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 33448 + }, + { + "epoch": 0.33449, + "grad_norm": 0.788203285422488, + "learning_rate": 0.003, + "loss": 4.041, + "step": 33449 + }, + { + "epoch": 0.3345, + "grad_norm": 0.7154054206118292, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 33450 + }, + { + "epoch": 0.33451, + "grad_norm": 0.7566436019733688, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 33451 + }, + { + "epoch": 0.33452, + "grad_norm": 0.7477463443748265, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 33452 + }, + { + "epoch": 0.33453, + "grad_norm": 0.7910376563083757, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 33453 + }, + { + "epoch": 0.33454, + "grad_norm": 0.8438335071022727, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 33454 + }, + { + "epoch": 0.33455, + "grad_norm": 0.7999201420421643, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 33455 + }, + { + "epoch": 0.33456, + "grad_norm": 0.8078301439137415, + "learning_rate": 0.003, + "loss": 4.0134, + "step": 33456 + }, + { + "epoch": 0.33457, + "grad_norm": 0.9172369153755748, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 33457 + }, + { + "epoch": 0.33458, + "grad_norm": 1.076612886918343, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 33458 + }, + { + "epoch": 0.33459, + "grad_norm": 0.9361910937059847, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 33459 + }, + { + "epoch": 0.3346, + "grad_norm": 0.7832802137151167, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 33460 + }, + { + "epoch": 0.33461, + "grad_norm": 0.6441892825156816, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 33461 + }, + { + "epoch": 0.33462, + "grad_norm": 0.7004834286542263, + "learning_rate": 0.003, + "loss": 4.069, + "step": 33462 + }, + { + "epoch": 0.33463, + "grad_norm": 0.6246980054397719, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 33463 + }, + { + "epoch": 0.33464, + "grad_norm": 0.7087798366136248, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 33464 + }, + { + "epoch": 0.33465, + "grad_norm": 0.6760655697340239, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 33465 + }, + { + "epoch": 0.33466, + "grad_norm": 0.6752056572128844, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 33466 + }, + { + "epoch": 0.33467, + "grad_norm": 0.8037842620489187, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 33467 + }, + { + "epoch": 0.33468, + "grad_norm": 0.9550500049232913, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 33468 + }, + { + "epoch": 0.33469, + "grad_norm": 0.9098043746304054, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 33469 + }, + { + "epoch": 0.3347, + "grad_norm": 0.6735099740264832, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 33470 + }, + { + "epoch": 0.33471, + "grad_norm": 0.6099054699074581, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 33471 + }, + { + "epoch": 0.33472, + "grad_norm": 0.7409331962393706, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 33472 + }, + { + "epoch": 0.33473, + "grad_norm": 0.7580832586106795, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 33473 + }, + { + "epoch": 0.33474, + "grad_norm": 0.788801196463899, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 33474 + }, + { + "epoch": 0.33475, + "grad_norm": 0.7675144571928182, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 33475 + }, + { + "epoch": 0.33476, + "grad_norm": 0.699106095156432, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 33476 + }, + { + "epoch": 0.33477, + "grad_norm": 0.6534834925644141, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 33477 + }, + { + "epoch": 0.33478, + "grad_norm": 0.6172583332945806, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 33478 + }, + { + "epoch": 0.33479, + "grad_norm": 0.6927279958322375, + "learning_rate": 0.003, + "loss": 4.0036, + "step": 33479 + }, + { + "epoch": 0.3348, + "grad_norm": 0.7499984993163773, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 33480 + }, + { + "epoch": 0.33481, + "grad_norm": 0.77208004144386, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 33481 + }, + { + "epoch": 0.33482, + "grad_norm": 0.7166004348920456, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 33482 + }, + { + "epoch": 0.33483, + "grad_norm": 0.7311637946440613, + "learning_rate": 0.003, + "loss": 4.008, + "step": 33483 + }, + { + "epoch": 0.33484, + "grad_norm": 0.8314427975961974, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 33484 + }, + { + "epoch": 0.33485, + "grad_norm": 0.9612630020359775, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 33485 + }, + { + "epoch": 0.33486, + "grad_norm": 1.0865671897687097, + "learning_rate": 0.003, + "loss": 4.032, + "step": 33486 + }, + { + "epoch": 0.33487, + "grad_norm": 0.9207722187890302, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 33487 + }, + { + "epoch": 0.33488, + "grad_norm": 0.8772632750026157, + "learning_rate": 0.003, + "loss": 4.028, + "step": 33488 + }, + { + "epoch": 0.33489, + "grad_norm": 0.8945441773116348, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 33489 + }, + { + "epoch": 0.3349, + "grad_norm": 0.9418057766643548, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 33490 + }, + { + "epoch": 0.33491, + "grad_norm": 1.0364769133825553, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 33491 + }, + { + "epoch": 0.33492, + "grad_norm": 1.047691807331402, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 33492 + }, + { + "epoch": 0.33493, + "grad_norm": 0.9758598841555752, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 33493 + }, + { + "epoch": 0.33494, + "grad_norm": 0.8666599567356137, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 33494 + }, + { + "epoch": 0.33495, + "grad_norm": 0.7837848736875653, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 33495 + }, + { + "epoch": 0.33496, + "grad_norm": 0.6720289867989876, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 33496 + }, + { + "epoch": 0.33497, + "grad_norm": 0.7969269255815048, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 33497 + }, + { + "epoch": 0.33498, + "grad_norm": 0.9371447038669312, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 33498 + }, + { + "epoch": 0.33499, + "grad_norm": 0.8813051562746631, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 33499 + }, + { + "epoch": 0.335, + "grad_norm": 0.7452557934591124, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 33500 + }, + { + "epoch": 0.33501, + "grad_norm": 0.7063572025967, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 33501 + }, + { + "epoch": 0.33502, + "grad_norm": 0.6605384912455938, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 33502 + }, + { + "epoch": 0.33503, + "grad_norm": 0.7856760938378934, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 33503 + }, + { + "epoch": 0.33504, + "grad_norm": 0.990539165310162, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 33504 + }, + { + "epoch": 0.33505, + "grad_norm": 1.2271671672265865, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 33505 + }, + { + "epoch": 0.33506, + "grad_norm": 0.9756346989812787, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 33506 + }, + { + "epoch": 0.33507, + "grad_norm": 1.0521701460024862, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 33507 + }, + { + "epoch": 0.33508, + "grad_norm": 0.8636386032198867, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 33508 + }, + { + "epoch": 0.33509, + "grad_norm": 0.8902431998971329, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 33509 + }, + { + "epoch": 0.3351, + "grad_norm": 0.7891654485515646, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 33510 + }, + { + "epoch": 0.33511, + "grad_norm": 0.7098602474287389, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 33511 + }, + { + "epoch": 0.33512, + "grad_norm": 0.8104152766077731, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 33512 + }, + { + "epoch": 0.33513, + "grad_norm": 0.9225202176828405, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 33513 + }, + { + "epoch": 0.33514, + "grad_norm": 0.8525060964878511, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 33514 + }, + { + "epoch": 0.33515, + "grad_norm": 0.8396774825751079, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 33515 + }, + { + "epoch": 0.33516, + "grad_norm": 0.8523550270247341, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 33516 + }, + { + "epoch": 0.33517, + "grad_norm": 0.9038406081204027, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 33517 + }, + { + "epoch": 0.33518, + "grad_norm": 0.9036610566089047, + "learning_rate": 0.003, + "loss": 4.028, + "step": 33518 + }, + { + "epoch": 0.33519, + "grad_norm": 0.8199293709236257, + "learning_rate": 0.003, + "loss": 4.0071, + "step": 33519 + }, + { + "epoch": 0.3352, + "grad_norm": 0.981300614635479, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 33520 + }, + { + "epoch": 0.33521, + "grad_norm": 1.1074675098194209, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 33521 + }, + { + "epoch": 0.33522, + "grad_norm": 1.071947485630254, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 33522 + }, + { + "epoch": 0.33523, + "grad_norm": 0.8517244491451923, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 33523 + }, + { + "epoch": 0.33524, + "grad_norm": 0.7104047852745636, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 33524 + }, + { + "epoch": 0.33525, + "grad_norm": 0.7429684524788617, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 33525 + }, + { + "epoch": 0.33526, + "grad_norm": 0.6862680593036469, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 33526 + }, + { + "epoch": 0.33527, + "grad_norm": 0.6051643274765951, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 33527 + }, + { + "epoch": 0.33528, + "grad_norm": 0.6414455771916885, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 33528 + }, + { + "epoch": 0.33529, + "grad_norm": 0.6937150298815976, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 33529 + }, + { + "epoch": 0.3353, + "grad_norm": 0.687733090238632, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 33530 + }, + { + "epoch": 0.33531, + "grad_norm": 0.7286216729197353, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 33531 + }, + { + "epoch": 0.33532, + "grad_norm": 0.7474821749271051, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 33532 + }, + { + "epoch": 0.33533, + "grad_norm": 0.7755996260099706, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 33533 + }, + { + "epoch": 0.33534, + "grad_norm": 0.887601695899262, + "learning_rate": 0.003, + "loss": 4.021, + "step": 33534 + }, + { + "epoch": 0.33535, + "grad_norm": 0.8679706282075552, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 33535 + }, + { + "epoch": 0.33536, + "grad_norm": 0.7469096714370186, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 33536 + }, + { + "epoch": 0.33537, + "grad_norm": 0.7306315425450887, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 33537 + }, + { + "epoch": 0.33538, + "grad_norm": 0.6994257134932458, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 33538 + }, + { + "epoch": 0.33539, + "grad_norm": 0.7170118195177576, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 33539 + }, + { + "epoch": 0.3354, + "grad_norm": 0.9195514664696472, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 33540 + }, + { + "epoch": 0.33541, + "grad_norm": 1.2413082430290139, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 33541 + }, + { + "epoch": 0.33542, + "grad_norm": 0.8220084764346933, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 33542 + }, + { + "epoch": 0.33543, + "grad_norm": 0.7021691356515798, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 33543 + }, + { + "epoch": 0.33544, + "grad_norm": 0.6708133080351312, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 33544 + }, + { + "epoch": 0.33545, + "grad_norm": 0.6937323111051921, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 33545 + }, + { + "epoch": 0.33546, + "grad_norm": 0.6405592964781028, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 33546 + }, + { + "epoch": 0.33547, + "grad_norm": 0.5923893310013788, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 33547 + }, + { + "epoch": 0.33548, + "grad_norm": 0.6567774079868601, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 33548 + }, + { + "epoch": 0.33549, + "grad_norm": 0.778111505029056, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 33549 + }, + { + "epoch": 0.3355, + "grad_norm": 0.8425385677834523, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 33550 + }, + { + "epoch": 0.33551, + "grad_norm": 0.9126983654811395, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 33551 + }, + { + "epoch": 0.33552, + "grad_norm": 1.0420663376188255, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 33552 + }, + { + "epoch": 0.33553, + "grad_norm": 1.0413664816664334, + "learning_rate": 0.003, + "loss": 4.059, + "step": 33553 + }, + { + "epoch": 0.33554, + "grad_norm": 0.9265316712741981, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 33554 + }, + { + "epoch": 0.33555, + "grad_norm": 0.9322532493863386, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 33555 + }, + { + "epoch": 0.33556, + "grad_norm": 1.0512132143797417, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 33556 + }, + { + "epoch": 0.33557, + "grad_norm": 1.0864026306413104, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 33557 + }, + { + "epoch": 0.33558, + "grad_norm": 1.0611270664298809, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 33558 + }, + { + "epoch": 0.33559, + "grad_norm": 0.9819778289622576, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 33559 + }, + { + "epoch": 0.3356, + "grad_norm": 0.9642082976140308, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 33560 + }, + { + "epoch": 0.33561, + "grad_norm": 0.9331276000729407, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 33561 + }, + { + "epoch": 0.33562, + "grad_norm": 0.8913522847478555, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 33562 + }, + { + "epoch": 0.33563, + "grad_norm": 1.12234459691052, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 33563 + }, + { + "epoch": 0.33564, + "grad_norm": 1.0219880317515369, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 33564 + }, + { + "epoch": 0.33565, + "grad_norm": 0.9117341748612894, + "learning_rate": 0.003, + "loss": 4.043, + "step": 33565 + }, + { + "epoch": 0.33566, + "grad_norm": 1.026076247329884, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 33566 + }, + { + "epoch": 0.33567, + "grad_norm": 1.104171858915612, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 33567 + }, + { + "epoch": 0.33568, + "grad_norm": 0.9939210631570862, + "learning_rate": 0.003, + "loss": 4.035, + "step": 33568 + }, + { + "epoch": 0.33569, + "grad_norm": 1.0430473680432613, + "learning_rate": 0.003, + "loss": 4.066, + "step": 33569 + }, + { + "epoch": 0.3357, + "grad_norm": 0.9253520419346458, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 33570 + }, + { + "epoch": 0.33571, + "grad_norm": 0.8039423286251106, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 33571 + }, + { + "epoch": 0.33572, + "grad_norm": 0.7364980089281515, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 33572 + }, + { + "epoch": 0.33573, + "grad_norm": 0.7413715412335512, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 33573 + }, + { + "epoch": 0.33574, + "grad_norm": 0.7096705774255924, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 33574 + }, + { + "epoch": 0.33575, + "grad_norm": 0.6701532182622197, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 33575 + }, + { + "epoch": 0.33576, + "grad_norm": 0.6076870341469583, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 33576 + }, + { + "epoch": 0.33577, + "grad_norm": 0.5813119915251628, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 33577 + }, + { + "epoch": 0.33578, + "grad_norm": 0.5336756142107969, + "learning_rate": 0.003, + "loss": 4.085, + "step": 33578 + }, + { + "epoch": 0.33579, + "grad_norm": 0.5116800094071948, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 33579 + }, + { + "epoch": 0.3358, + "grad_norm": 0.5586390034089612, + "learning_rate": 0.003, + "loss": 4.026, + "step": 33580 + }, + { + "epoch": 0.33581, + "grad_norm": 0.5494169250122006, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 33581 + }, + { + "epoch": 0.33582, + "grad_norm": 0.6241252380536063, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 33582 + }, + { + "epoch": 0.33583, + "grad_norm": 0.7883056592711065, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 33583 + }, + { + "epoch": 0.33584, + "grad_norm": 0.9625133168142012, + "learning_rate": 0.003, + "loss": 4.031, + "step": 33584 + }, + { + "epoch": 0.33585, + "grad_norm": 0.8874149010677069, + "learning_rate": 0.003, + "loss": 3.9974, + "step": 33585 + }, + { + "epoch": 0.33586, + "grad_norm": 0.7069268686628442, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 33586 + }, + { + "epoch": 0.33587, + "grad_norm": 0.6571131322707537, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 33587 + }, + { + "epoch": 0.33588, + "grad_norm": 0.6330083116229438, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 33588 + }, + { + "epoch": 0.33589, + "grad_norm": 0.6446224322959807, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 33589 + }, + { + "epoch": 0.3359, + "grad_norm": 0.6937806903911785, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 33590 + }, + { + "epoch": 0.33591, + "grad_norm": 0.7637299072414784, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 33591 + }, + { + "epoch": 0.33592, + "grad_norm": 0.7201971779386008, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 33592 + }, + { + "epoch": 0.33593, + "grad_norm": 0.7469280616477116, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 33593 + }, + { + "epoch": 0.33594, + "grad_norm": 0.8502883924404253, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 33594 + }, + { + "epoch": 0.33595, + "grad_norm": 0.9508864628861705, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 33595 + }, + { + "epoch": 0.33596, + "grad_norm": 0.8993536385986304, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 33596 + }, + { + "epoch": 0.33597, + "grad_norm": 0.946571821614601, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 33597 + }, + { + "epoch": 0.33598, + "grad_norm": 0.9964047155552305, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 33598 + }, + { + "epoch": 0.33599, + "grad_norm": 1.0515835074207691, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 33599 + }, + { + "epoch": 0.336, + "grad_norm": 1.0316345538006748, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 33600 + }, + { + "epoch": 0.33601, + "grad_norm": 1.0581854790699317, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 33601 + }, + { + "epoch": 0.33602, + "grad_norm": 1.0690935533164048, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 33602 + }, + { + "epoch": 0.33603, + "grad_norm": 1.0866548933742963, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 33603 + }, + { + "epoch": 0.33604, + "grad_norm": 0.9572430958557303, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 33604 + }, + { + "epoch": 0.33605, + "grad_norm": 1.0522957014862997, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 33605 + }, + { + "epoch": 0.33606, + "grad_norm": 0.9926888688482821, + "learning_rate": 0.003, + "loss": 4.066, + "step": 33606 + }, + { + "epoch": 0.33607, + "grad_norm": 1.076751246776696, + "learning_rate": 0.003, + "loss": 4.071, + "step": 33607 + }, + { + "epoch": 0.33608, + "grad_norm": 0.9915906862540832, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 33608 + }, + { + "epoch": 0.33609, + "grad_norm": 0.9371044354169651, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 33609 + }, + { + "epoch": 0.3361, + "grad_norm": 0.9645135972349086, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 33610 + }, + { + "epoch": 0.33611, + "grad_norm": 0.825670657187605, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 33611 + }, + { + "epoch": 0.33612, + "grad_norm": 0.7731019708277355, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 33612 + }, + { + "epoch": 0.33613, + "grad_norm": 0.8113392526737846, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 33613 + }, + { + "epoch": 0.33614, + "grad_norm": 0.7773188193018316, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 33614 + }, + { + "epoch": 0.33615, + "grad_norm": 0.7674169686530927, + "learning_rate": 0.003, + "loss": 4.059, + "step": 33615 + }, + { + "epoch": 0.33616, + "grad_norm": 0.7478962721218371, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 33616 + }, + { + "epoch": 0.33617, + "grad_norm": 0.6569174795283418, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 33617 + }, + { + "epoch": 0.33618, + "grad_norm": 0.7916644993925536, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 33618 + }, + { + "epoch": 0.33619, + "grad_norm": 1.0444404689157807, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 33619 + }, + { + "epoch": 0.3362, + "grad_norm": 1.3897323235681776, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 33620 + }, + { + "epoch": 0.33621, + "grad_norm": 0.5869293321695513, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 33621 + }, + { + "epoch": 0.33622, + "grad_norm": 0.808204560836411, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 33622 + }, + { + "epoch": 0.33623, + "grad_norm": 0.873072596370925, + "learning_rate": 0.003, + "loss": 4.058, + "step": 33623 + }, + { + "epoch": 0.33624, + "grad_norm": 0.7823250984489073, + "learning_rate": 0.003, + "loss": 4.029, + "step": 33624 + }, + { + "epoch": 0.33625, + "grad_norm": 0.8199875435623101, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 33625 + }, + { + "epoch": 0.33626, + "grad_norm": 0.7675983790034979, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 33626 + }, + { + "epoch": 0.33627, + "grad_norm": 0.7638764630141989, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 33627 + }, + { + "epoch": 0.33628, + "grad_norm": 0.7984148238622651, + "learning_rate": 0.003, + "loss": 4.067, + "step": 33628 + }, + { + "epoch": 0.33629, + "grad_norm": 0.7167186465908104, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 33629 + }, + { + "epoch": 0.3363, + "grad_norm": 0.5899983030177005, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 33630 + }, + { + "epoch": 0.33631, + "grad_norm": 0.5856574105830651, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 33631 + }, + { + "epoch": 0.33632, + "grad_norm": 0.5706383355837534, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 33632 + }, + { + "epoch": 0.33633, + "grad_norm": 0.6110297849270651, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 33633 + }, + { + "epoch": 0.33634, + "grad_norm": 0.6345777839586034, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 33634 + }, + { + "epoch": 0.33635, + "grad_norm": 0.6636477086617412, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 33635 + }, + { + "epoch": 0.33636, + "grad_norm": 0.7361149268051024, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 33636 + }, + { + "epoch": 0.33637, + "grad_norm": 0.8860355190386732, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 33637 + }, + { + "epoch": 0.33638, + "grad_norm": 1.1958196045003993, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 33638 + }, + { + "epoch": 0.33639, + "grad_norm": 1.0366855651951992, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 33639 + }, + { + "epoch": 0.3364, + "grad_norm": 0.7903674522209951, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 33640 + }, + { + "epoch": 0.33641, + "grad_norm": 0.7909345835777578, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 33641 + }, + { + "epoch": 0.33642, + "grad_norm": 0.874671546363745, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 33642 + }, + { + "epoch": 0.33643, + "grad_norm": 0.8767178439344934, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 33643 + }, + { + "epoch": 0.33644, + "grad_norm": 0.7661993858943851, + "learning_rate": 0.003, + "loss": 4.029, + "step": 33644 + }, + { + "epoch": 0.33645, + "grad_norm": 0.7987692670303714, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 33645 + }, + { + "epoch": 0.33646, + "grad_norm": 0.8741513571790588, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 33646 + }, + { + "epoch": 0.33647, + "grad_norm": 0.9145512880201334, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 33647 + }, + { + "epoch": 0.33648, + "grad_norm": 0.8829331282614442, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 33648 + }, + { + "epoch": 0.33649, + "grad_norm": 0.7420776974336835, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 33649 + }, + { + "epoch": 0.3365, + "grad_norm": 0.7857909331910413, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 33650 + }, + { + "epoch": 0.33651, + "grad_norm": 0.8613129125354198, + "learning_rate": 0.003, + "loss": 4.023, + "step": 33651 + }, + { + "epoch": 0.33652, + "grad_norm": 0.993851430434911, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 33652 + }, + { + "epoch": 0.33653, + "grad_norm": 0.9995989247863574, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 33653 + }, + { + "epoch": 0.33654, + "grad_norm": 0.9636129022636271, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 33654 + }, + { + "epoch": 0.33655, + "grad_norm": 0.8158394071431365, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 33655 + }, + { + "epoch": 0.33656, + "grad_norm": 0.8175438940916879, + "learning_rate": 0.003, + "loss": 4.051, + "step": 33656 + }, + { + "epoch": 0.33657, + "grad_norm": 0.9134112886931331, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 33657 + }, + { + "epoch": 0.33658, + "grad_norm": 0.8977398535858316, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 33658 + }, + { + "epoch": 0.33659, + "grad_norm": 0.938007301479686, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 33659 + }, + { + "epoch": 0.3366, + "grad_norm": 0.9014220950115539, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 33660 + }, + { + "epoch": 0.33661, + "grad_norm": 0.8931790975158279, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 33661 + }, + { + "epoch": 0.33662, + "grad_norm": 0.9453495245386333, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 33662 + }, + { + "epoch": 0.33663, + "grad_norm": 0.7771897120815943, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 33663 + }, + { + "epoch": 0.33664, + "grad_norm": 0.6816267950435487, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 33664 + }, + { + "epoch": 0.33665, + "grad_norm": 0.7519933077388464, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 33665 + }, + { + "epoch": 0.33666, + "grad_norm": 0.670810083297287, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 33666 + }, + { + "epoch": 0.33667, + "grad_norm": 0.6485022638629593, + "learning_rate": 0.003, + "loss": 4.032, + "step": 33667 + }, + { + "epoch": 0.33668, + "grad_norm": 0.7126986455539552, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 33668 + }, + { + "epoch": 0.33669, + "grad_norm": 1.0060010221335662, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 33669 + }, + { + "epoch": 0.3367, + "grad_norm": 1.3738223183805245, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 33670 + }, + { + "epoch": 0.33671, + "grad_norm": 0.6155110726688291, + "learning_rate": 0.003, + "loss": 4.0035, + "step": 33671 + }, + { + "epoch": 0.33672, + "grad_norm": 0.6552756859273646, + "learning_rate": 0.003, + "loss": 4.023, + "step": 33672 + }, + { + "epoch": 0.33673, + "grad_norm": 0.6331318239781898, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 33673 + }, + { + "epoch": 0.33674, + "grad_norm": 0.6199653128102041, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 33674 + }, + { + "epoch": 0.33675, + "grad_norm": 0.6463716723007972, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 33675 + }, + { + "epoch": 0.33676, + "grad_norm": 0.6865905547730407, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 33676 + }, + { + "epoch": 0.33677, + "grad_norm": 0.6644797220877094, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 33677 + }, + { + "epoch": 0.33678, + "grad_norm": 0.6129449638610547, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 33678 + }, + { + "epoch": 0.33679, + "grad_norm": 0.6996823077271874, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 33679 + }, + { + "epoch": 0.3368, + "grad_norm": 0.9462330542137047, + "learning_rate": 0.003, + "loss": 4.026, + "step": 33680 + }, + { + "epoch": 0.33681, + "grad_norm": 1.2048944988596013, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 33681 + }, + { + "epoch": 0.33682, + "grad_norm": 0.984115128170861, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 33682 + }, + { + "epoch": 0.33683, + "grad_norm": 0.7998684697696333, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 33683 + }, + { + "epoch": 0.33684, + "grad_norm": 0.6805048124053439, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 33684 + }, + { + "epoch": 0.33685, + "grad_norm": 0.5751741105359596, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 33685 + }, + { + "epoch": 0.33686, + "grad_norm": 0.5448355358246327, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 33686 + }, + { + "epoch": 0.33687, + "grad_norm": 0.5440085183920897, + "learning_rate": 0.003, + "loss": 4.0021, + "step": 33687 + }, + { + "epoch": 0.33688, + "grad_norm": 0.6644786693562702, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 33688 + }, + { + "epoch": 0.33689, + "grad_norm": 0.7788718592854356, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 33689 + }, + { + "epoch": 0.3369, + "grad_norm": 0.8263109832876955, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 33690 + }, + { + "epoch": 0.33691, + "grad_norm": 0.8749051722820884, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 33691 + }, + { + "epoch": 0.33692, + "grad_norm": 0.9497159616559773, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 33692 + }, + { + "epoch": 0.33693, + "grad_norm": 1.0547912898490093, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 33693 + }, + { + "epoch": 0.33694, + "grad_norm": 1.0700180046678145, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 33694 + }, + { + "epoch": 0.33695, + "grad_norm": 0.9393820806815416, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 33695 + }, + { + "epoch": 0.33696, + "grad_norm": 1.022833564704935, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 33696 + }, + { + "epoch": 0.33697, + "grad_norm": 1.081619061855931, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 33697 + }, + { + "epoch": 0.33698, + "grad_norm": 1.2224473216288065, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 33698 + }, + { + "epoch": 0.33699, + "grad_norm": 1.0296145550556874, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 33699 + }, + { + "epoch": 0.337, + "grad_norm": 0.7948861975686476, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 33700 + }, + { + "epoch": 0.33701, + "grad_norm": 0.7016093421820723, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 33701 + }, + { + "epoch": 0.33702, + "grad_norm": 0.6192690036724552, + "learning_rate": 0.003, + "loss": 4.035, + "step": 33702 + }, + { + "epoch": 0.33703, + "grad_norm": 0.6283667662964441, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 33703 + }, + { + "epoch": 0.33704, + "grad_norm": 0.6601439284802328, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 33704 + }, + { + "epoch": 0.33705, + "grad_norm": 0.6387464488781871, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 33705 + }, + { + "epoch": 0.33706, + "grad_norm": 0.7184624215387317, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 33706 + }, + { + "epoch": 0.33707, + "grad_norm": 0.9146334160884503, + "learning_rate": 0.003, + "loss": 4.05, + "step": 33707 + }, + { + "epoch": 0.33708, + "grad_norm": 1.1030600560273256, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 33708 + }, + { + "epoch": 0.33709, + "grad_norm": 0.866243458237003, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 33709 + }, + { + "epoch": 0.3371, + "grad_norm": 0.7828058969550554, + "learning_rate": 0.003, + "loss": 4.044, + "step": 33710 + }, + { + "epoch": 0.33711, + "grad_norm": 0.7171777944216865, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 33711 + }, + { + "epoch": 0.33712, + "grad_norm": 0.7091630800835255, + "learning_rate": 0.003, + "loss": 4.024, + "step": 33712 + }, + { + "epoch": 0.33713, + "grad_norm": 0.7129814720576563, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 33713 + }, + { + "epoch": 0.33714, + "grad_norm": 0.7443836434509767, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 33714 + }, + { + "epoch": 0.33715, + "grad_norm": 0.7978504803587637, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 33715 + }, + { + "epoch": 0.33716, + "grad_norm": 1.0511832784668238, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 33716 + }, + { + "epoch": 0.33717, + "grad_norm": 1.050841625546212, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 33717 + }, + { + "epoch": 0.33718, + "grad_norm": 1.0144876774409533, + "learning_rate": 0.003, + "loss": 4.047, + "step": 33718 + }, + { + "epoch": 0.33719, + "grad_norm": 0.9539808069223594, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 33719 + }, + { + "epoch": 0.3372, + "grad_norm": 0.9191649905101167, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 33720 + }, + { + "epoch": 0.33721, + "grad_norm": 1.0479468287899436, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 33721 + }, + { + "epoch": 0.33722, + "grad_norm": 0.8579074380774641, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 33722 + }, + { + "epoch": 0.33723, + "grad_norm": 0.7049855400400556, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 33723 + }, + { + "epoch": 0.33724, + "grad_norm": 0.624082936470447, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 33724 + }, + { + "epoch": 0.33725, + "grad_norm": 0.7087209720191705, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 33725 + }, + { + "epoch": 0.33726, + "grad_norm": 0.7502743865520788, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 33726 + }, + { + "epoch": 0.33727, + "grad_norm": 0.6627149796570143, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 33727 + }, + { + "epoch": 0.33728, + "grad_norm": 0.5346839962942674, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 33728 + }, + { + "epoch": 0.33729, + "grad_norm": 0.5361997695426446, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 33729 + }, + { + "epoch": 0.3373, + "grad_norm": 0.5693474006982953, + "learning_rate": 0.003, + "loss": 4.03, + "step": 33730 + }, + { + "epoch": 0.33731, + "grad_norm": 0.6642634981053736, + "learning_rate": 0.003, + "loss": 4.027, + "step": 33731 + }, + { + "epoch": 0.33732, + "grad_norm": 0.878114166881482, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 33732 + }, + { + "epoch": 0.33733, + "grad_norm": 1.3233062076376578, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 33733 + }, + { + "epoch": 0.33734, + "grad_norm": 0.6038469268795158, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 33734 + }, + { + "epoch": 0.33735, + "grad_norm": 0.696028703666814, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 33735 + }, + { + "epoch": 0.33736, + "grad_norm": 1.0810904032179156, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 33736 + }, + { + "epoch": 0.33737, + "grad_norm": 1.0671751705768573, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 33737 + }, + { + "epoch": 0.33738, + "grad_norm": 0.8449499775676197, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 33738 + }, + { + "epoch": 0.33739, + "grad_norm": 0.7887234417312652, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 33739 + }, + { + "epoch": 0.3374, + "grad_norm": 0.9723895713583691, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 33740 + }, + { + "epoch": 0.33741, + "grad_norm": 1.162317984052887, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 33741 + }, + { + "epoch": 0.33742, + "grad_norm": 1.0167555695562032, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 33742 + }, + { + "epoch": 0.33743, + "grad_norm": 0.9762673040015455, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 33743 + }, + { + "epoch": 0.33744, + "grad_norm": 0.9420215015286822, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 33744 + }, + { + "epoch": 0.33745, + "grad_norm": 0.9974545233684636, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 33745 + }, + { + "epoch": 0.33746, + "grad_norm": 1.047316122945228, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 33746 + }, + { + "epoch": 0.33747, + "grad_norm": 1.02788321708909, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 33747 + }, + { + "epoch": 0.33748, + "grad_norm": 0.9621453639986354, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 33748 + }, + { + "epoch": 0.33749, + "grad_norm": 0.7753507389702278, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 33749 + }, + { + "epoch": 0.3375, + "grad_norm": 0.7498129684899302, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 33750 + }, + { + "epoch": 0.33751, + "grad_norm": 0.8573142913748312, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 33751 + }, + { + "epoch": 0.33752, + "grad_norm": 0.9865724859907175, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 33752 + }, + { + "epoch": 0.33753, + "grad_norm": 1.0244092243291494, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 33753 + }, + { + "epoch": 0.33754, + "grad_norm": 0.8906472731390455, + "learning_rate": 0.003, + "loss": 4.069, + "step": 33754 + }, + { + "epoch": 0.33755, + "grad_norm": 0.9583044904772331, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 33755 + }, + { + "epoch": 0.33756, + "grad_norm": 1.0169105579582782, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 33756 + }, + { + "epoch": 0.33757, + "grad_norm": 1.031396706040273, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 33757 + }, + { + "epoch": 0.33758, + "grad_norm": 0.865718027429324, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 33758 + }, + { + "epoch": 0.33759, + "grad_norm": 0.7855548780303279, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 33759 + }, + { + "epoch": 0.3376, + "grad_norm": 0.8459300000118278, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 33760 + }, + { + "epoch": 0.33761, + "grad_norm": 1.0958415695589119, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 33761 + }, + { + "epoch": 0.33762, + "grad_norm": 1.020360733991124, + "learning_rate": 0.003, + "loss": 4.074, + "step": 33762 + }, + { + "epoch": 0.33763, + "grad_norm": 1.012402323547838, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 33763 + }, + { + "epoch": 0.33764, + "grad_norm": 0.864281530533913, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 33764 + }, + { + "epoch": 0.33765, + "grad_norm": 0.7253130091408841, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 33765 + }, + { + "epoch": 0.33766, + "grad_norm": 0.6422790038199969, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 33766 + }, + { + "epoch": 0.33767, + "grad_norm": 0.6274937730802178, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 33767 + }, + { + "epoch": 0.33768, + "grad_norm": 0.6524088958063811, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 33768 + }, + { + "epoch": 0.33769, + "grad_norm": 0.6233708698861566, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 33769 + }, + { + "epoch": 0.3377, + "grad_norm": 0.6706095291185132, + "learning_rate": 0.003, + "loss": 3.9987, + "step": 33770 + }, + { + "epoch": 0.33771, + "grad_norm": 0.6600965645514117, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 33771 + }, + { + "epoch": 0.33772, + "grad_norm": 0.62100640408022, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 33772 + }, + { + "epoch": 0.33773, + "grad_norm": 0.6640184891560466, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 33773 + }, + { + "epoch": 0.33774, + "grad_norm": 0.6402224688503386, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 33774 + }, + { + "epoch": 0.33775, + "grad_norm": 0.6782496588249532, + "learning_rate": 0.003, + "loss": 4.0064, + "step": 33775 + }, + { + "epoch": 0.33776, + "grad_norm": 0.9506969353423204, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 33776 + }, + { + "epoch": 0.33777, + "grad_norm": 1.392708177996448, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 33777 + }, + { + "epoch": 0.33778, + "grad_norm": 0.5795193119261093, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 33778 + }, + { + "epoch": 0.33779, + "grad_norm": 0.7761015423260159, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 33779 + }, + { + "epoch": 0.3378, + "grad_norm": 0.9514346302334404, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 33780 + }, + { + "epoch": 0.33781, + "grad_norm": 0.9336378997415741, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 33781 + }, + { + "epoch": 0.33782, + "grad_norm": 0.9094389881063851, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 33782 + }, + { + "epoch": 0.33783, + "grad_norm": 0.7847867544099536, + "learning_rate": 0.003, + "loss": 4.0077, + "step": 33783 + }, + { + "epoch": 0.33784, + "grad_norm": 0.6865547692224602, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 33784 + }, + { + "epoch": 0.33785, + "grad_norm": 0.6318388651387125, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 33785 + }, + { + "epoch": 0.33786, + "grad_norm": 0.6640374444864019, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 33786 + }, + { + "epoch": 0.33787, + "grad_norm": 0.7648249150247822, + "learning_rate": 0.003, + "loss": 4.042, + "step": 33787 + }, + { + "epoch": 0.33788, + "grad_norm": 0.766325275480788, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 33788 + }, + { + "epoch": 0.33789, + "grad_norm": 0.858532639174059, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 33789 + }, + { + "epoch": 0.3379, + "grad_norm": 0.951045205251094, + "learning_rate": 0.003, + "loss": 4.035, + "step": 33790 + }, + { + "epoch": 0.33791, + "grad_norm": 0.9298190962377589, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 33791 + }, + { + "epoch": 0.33792, + "grad_norm": 0.9441326909092653, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 33792 + }, + { + "epoch": 0.33793, + "grad_norm": 0.9721437550451898, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 33793 + }, + { + "epoch": 0.33794, + "grad_norm": 0.8942603192329401, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 33794 + }, + { + "epoch": 0.33795, + "grad_norm": 0.8612172935880857, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 33795 + }, + { + "epoch": 0.33796, + "grad_norm": 0.882955891135359, + "learning_rate": 0.003, + "loss": 4.058, + "step": 33796 + }, + { + "epoch": 0.33797, + "grad_norm": 0.8824934084216972, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 33797 + }, + { + "epoch": 0.33798, + "grad_norm": 0.9883406607215025, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 33798 + }, + { + "epoch": 0.33799, + "grad_norm": 1.0566472667122047, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 33799 + }, + { + "epoch": 0.338, + "grad_norm": 0.8839962310425998, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 33800 + }, + { + "epoch": 0.33801, + "grad_norm": 0.7845811748934701, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 33801 + }, + { + "epoch": 0.33802, + "grad_norm": 0.7770584710378017, + "learning_rate": 0.003, + "loss": 4.051, + "step": 33802 + }, + { + "epoch": 0.33803, + "grad_norm": 0.7406397409570852, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 33803 + }, + { + "epoch": 0.33804, + "grad_norm": 0.7352998325867052, + "learning_rate": 0.003, + "loss": 3.9835, + "step": 33804 + }, + { + "epoch": 0.33805, + "grad_norm": 0.791367794128499, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 33805 + }, + { + "epoch": 0.33806, + "grad_norm": 0.8487970159084213, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 33806 + }, + { + "epoch": 0.33807, + "grad_norm": 0.9297580414380384, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 33807 + }, + { + "epoch": 0.33808, + "grad_norm": 0.9863059192325285, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 33808 + }, + { + "epoch": 0.33809, + "grad_norm": 1.0595019645137196, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 33809 + }, + { + "epoch": 0.3381, + "grad_norm": 0.8549935067457187, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 33810 + }, + { + "epoch": 0.33811, + "grad_norm": 0.7434150034506393, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 33811 + }, + { + "epoch": 0.33812, + "grad_norm": 0.732443026899655, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 33812 + }, + { + "epoch": 0.33813, + "grad_norm": 0.6394441098591139, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 33813 + }, + { + "epoch": 0.33814, + "grad_norm": 0.6593133801120066, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 33814 + }, + { + "epoch": 0.33815, + "grad_norm": 0.5546760161598798, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 33815 + }, + { + "epoch": 0.33816, + "grad_norm": 0.551799298160508, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 33816 + }, + { + "epoch": 0.33817, + "grad_norm": 0.6308094122877197, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 33817 + }, + { + "epoch": 0.33818, + "grad_norm": 0.6738498287449948, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 33818 + }, + { + "epoch": 0.33819, + "grad_norm": 0.7803895787172441, + "learning_rate": 0.003, + "loss": 4.053, + "step": 33819 + }, + { + "epoch": 0.3382, + "grad_norm": 0.886281732424119, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 33820 + }, + { + "epoch": 0.33821, + "grad_norm": 0.8765366089642417, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 33821 + }, + { + "epoch": 0.33822, + "grad_norm": 0.8468068345465231, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 33822 + }, + { + "epoch": 0.33823, + "grad_norm": 0.8140039293249891, + "learning_rate": 0.003, + "loss": 4.029, + "step": 33823 + }, + { + "epoch": 0.33824, + "grad_norm": 0.9490528531709089, + "learning_rate": 0.003, + "loss": 4.022, + "step": 33824 + }, + { + "epoch": 0.33825, + "grad_norm": 1.127976542735466, + "learning_rate": 0.003, + "loss": 4.033, + "step": 33825 + }, + { + "epoch": 0.33826, + "grad_norm": 0.8872414865733492, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 33826 + }, + { + "epoch": 0.33827, + "grad_norm": 0.8125460762829133, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 33827 + }, + { + "epoch": 0.33828, + "grad_norm": 0.8597610011335546, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 33828 + }, + { + "epoch": 0.33829, + "grad_norm": 0.7659670046658855, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 33829 + }, + { + "epoch": 0.3383, + "grad_norm": 0.6466119535673419, + "learning_rate": 0.003, + "loss": 4.005, + "step": 33830 + }, + { + "epoch": 0.33831, + "grad_norm": 0.668991323831408, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 33831 + }, + { + "epoch": 0.33832, + "grad_norm": 0.6399733485191847, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 33832 + }, + { + "epoch": 0.33833, + "grad_norm": 0.6870153089244928, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 33833 + }, + { + "epoch": 0.33834, + "grad_norm": 0.8011481327802343, + "learning_rate": 0.003, + "loss": 4.0064, + "step": 33834 + }, + { + "epoch": 0.33835, + "grad_norm": 0.9422141196154757, + "learning_rate": 0.003, + "loss": 4.045, + "step": 33835 + }, + { + "epoch": 0.33836, + "grad_norm": 1.066269169322221, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 33836 + }, + { + "epoch": 0.33837, + "grad_norm": 1.0474269708866115, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 33837 + }, + { + "epoch": 0.33838, + "grad_norm": 0.8981477826465813, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 33838 + }, + { + "epoch": 0.33839, + "grad_norm": 0.716292037558805, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 33839 + }, + { + "epoch": 0.3384, + "grad_norm": 0.6982057676058101, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 33840 + }, + { + "epoch": 0.33841, + "grad_norm": 0.7708414426348612, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 33841 + }, + { + "epoch": 0.33842, + "grad_norm": 0.9415533452580352, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 33842 + }, + { + "epoch": 0.33843, + "grad_norm": 1.1446530133067467, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 33843 + }, + { + "epoch": 0.33844, + "grad_norm": 0.952355254581042, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 33844 + }, + { + "epoch": 0.33845, + "grad_norm": 1.0192463098079565, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 33845 + }, + { + "epoch": 0.33846, + "grad_norm": 1.0353314678797443, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 33846 + }, + { + "epoch": 0.33847, + "grad_norm": 1.0163923848392722, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 33847 + }, + { + "epoch": 0.33848, + "grad_norm": 0.8638210697912252, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 33848 + }, + { + "epoch": 0.33849, + "grad_norm": 0.7865636205890819, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 33849 + }, + { + "epoch": 0.3385, + "grad_norm": 0.733418168445731, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 33850 + }, + { + "epoch": 0.33851, + "grad_norm": 0.7999615393289004, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 33851 + }, + { + "epoch": 0.33852, + "grad_norm": 0.9279038084876917, + "learning_rate": 0.003, + "loss": 4.017, + "step": 33852 + }, + { + "epoch": 0.33853, + "grad_norm": 0.9738942269160589, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 33853 + }, + { + "epoch": 0.33854, + "grad_norm": 1.0194273700353533, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 33854 + }, + { + "epoch": 0.33855, + "grad_norm": 0.8991948194746865, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 33855 + }, + { + "epoch": 0.33856, + "grad_norm": 0.9951685827787403, + "learning_rate": 0.003, + "loss": 4.026, + "step": 33856 + }, + { + "epoch": 0.33857, + "grad_norm": 1.0705511544722706, + "learning_rate": 0.003, + "loss": 4.04, + "step": 33857 + }, + { + "epoch": 0.33858, + "grad_norm": 0.9591458755190022, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 33858 + }, + { + "epoch": 0.33859, + "grad_norm": 0.956784518143416, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 33859 + }, + { + "epoch": 0.3386, + "grad_norm": 0.9762666824630115, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 33860 + }, + { + "epoch": 0.33861, + "grad_norm": 0.927142664764423, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 33861 + }, + { + "epoch": 0.33862, + "grad_norm": 1.0025601305493832, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 33862 + }, + { + "epoch": 0.33863, + "grad_norm": 0.8256886273732722, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 33863 + }, + { + "epoch": 0.33864, + "grad_norm": 0.6667967093666312, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 33864 + }, + { + "epoch": 0.33865, + "grad_norm": 0.727284170492555, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 33865 + }, + { + "epoch": 0.33866, + "grad_norm": 0.6716632324289056, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 33866 + }, + { + "epoch": 0.33867, + "grad_norm": 0.5907775819845212, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 33867 + }, + { + "epoch": 0.33868, + "grad_norm": 0.6200656805926703, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 33868 + }, + { + "epoch": 0.33869, + "grad_norm": 0.5988276375441735, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 33869 + }, + { + "epoch": 0.3387, + "grad_norm": 0.5969392478144382, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 33870 + }, + { + "epoch": 0.33871, + "grad_norm": 0.6364715901463579, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 33871 + }, + { + "epoch": 0.33872, + "grad_norm": 0.8102056790940111, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 33872 + }, + { + "epoch": 0.33873, + "grad_norm": 0.9610363744479001, + "learning_rate": 0.003, + "loss": 4.059, + "step": 33873 + }, + { + "epoch": 0.33874, + "grad_norm": 0.9779386168157106, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 33874 + }, + { + "epoch": 0.33875, + "grad_norm": 0.9898314085580503, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 33875 + }, + { + "epoch": 0.33876, + "grad_norm": 0.9704065301153844, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 33876 + }, + { + "epoch": 0.33877, + "grad_norm": 0.7093498998371275, + "learning_rate": 0.003, + "loss": 4.024, + "step": 33877 + }, + { + "epoch": 0.33878, + "grad_norm": 0.6650131618031551, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 33878 + }, + { + "epoch": 0.33879, + "grad_norm": 0.7183072582243293, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 33879 + }, + { + "epoch": 0.3388, + "grad_norm": 0.6754542598339904, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 33880 + }, + { + "epoch": 0.33881, + "grad_norm": 0.6847795594648908, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 33881 + }, + { + "epoch": 0.33882, + "grad_norm": 0.7400395087204644, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 33882 + }, + { + "epoch": 0.33883, + "grad_norm": 0.8589590828997828, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 33883 + }, + { + "epoch": 0.33884, + "grad_norm": 0.9289306099553821, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 33884 + }, + { + "epoch": 0.33885, + "grad_norm": 0.800640792990739, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 33885 + }, + { + "epoch": 0.33886, + "grad_norm": 0.834523650076088, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 33886 + }, + { + "epoch": 0.33887, + "grad_norm": 0.9679795628922451, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 33887 + }, + { + "epoch": 0.33888, + "grad_norm": 0.9319319086745497, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 33888 + }, + { + "epoch": 0.33889, + "grad_norm": 0.8257557339882334, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 33889 + }, + { + "epoch": 0.3389, + "grad_norm": 0.9070968517682227, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 33890 + }, + { + "epoch": 0.33891, + "grad_norm": 0.9442435272323383, + "learning_rate": 0.003, + "loss": 4.059, + "step": 33891 + }, + { + "epoch": 0.33892, + "grad_norm": 0.9284791352487384, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 33892 + }, + { + "epoch": 0.33893, + "grad_norm": 0.9888968675124461, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 33893 + }, + { + "epoch": 0.33894, + "grad_norm": 0.9292441513275207, + "learning_rate": 0.003, + "loss": 4.056, + "step": 33894 + }, + { + "epoch": 0.33895, + "grad_norm": 0.8184736492840081, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 33895 + }, + { + "epoch": 0.33896, + "grad_norm": 0.6972622789561078, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 33896 + }, + { + "epoch": 0.33897, + "grad_norm": 0.7391064175844099, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 33897 + }, + { + "epoch": 0.33898, + "grad_norm": 0.7000797647075282, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 33898 + }, + { + "epoch": 0.33899, + "grad_norm": 0.6806905625410583, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 33899 + }, + { + "epoch": 0.339, + "grad_norm": 0.8690201398200897, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 33900 + }, + { + "epoch": 0.33901, + "grad_norm": 1.0287014063106046, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 33901 + }, + { + "epoch": 0.33902, + "grad_norm": 1.0950433315026802, + "learning_rate": 0.003, + "loss": 4.035, + "step": 33902 + }, + { + "epoch": 0.33903, + "grad_norm": 1.0291223986569278, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 33903 + }, + { + "epoch": 0.33904, + "grad_norm": 1.0881742538989359, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 33904 + }, + { + "epoch": 0.33905, + "grad_norm": 0.9089145461989396, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 33905 + }, + { + "epoch": 0.33906, + "grad_norm": 0.907672643636694, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 33906 + }, + { + "epoch": 0.33907, + "grad_norm": 0.9193154513659452, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 33907 + }, + { + "epoch": 0.33908, + "grad_norm": 0.8434930238866593, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 33908 + }, + { + "epoch": 0.33909, + "grad_norm": 0.7817617634477758, + "learning_rate": 0.003, + "loss": 4.0087, + "step": 33909 + }, + { + "epoch": 0.3391, + "grad_norm": 0.8184585773660709, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 33910 + }, + { + "epoch": 0.33911, + "grad_norm": 0.9509032226951651, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 33911 + }, + { + "epoch": 0.33912, + "grad_norm": 1.0450757195679297, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 33912 + }, + { + "epoch": 0.33913, + "grad_norm": 0.9549056090915814, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 33913 + }, + { + "epoch": 0.33914, + "grad_norm": 1.0151572512788025, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 33914 + }, + { + "epoch": 0.33915, + "grad_norm": 0.9899429975718765, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 33915 + }, + { + "epoch": 0.33916, + "grad_norm": 1.0165365246587603, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 33916 + }, + { + "epoch": 0.33917, + "grad_norm": 0.8557490629076506, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 33917 + }, + { + "epoch": 0.33918, + "grad_norm": 0.7005441171152348, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 33918 + }, + { + "epoch": 0.33919, + "grad_norm": 0.6688066978284886, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 33919 + }, + { + "epoch": 0.3392, + "grad_norm": 0.579532699428292, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 33920 + }, + { + "epoch": 0.33921, + "grad_norm": 0.608005173501639, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 33921 + }, + { + "epoch": 0.33922, + "grad_norm": 0.6343890050967571, + "learning_rate": 0.003, + "loss": 4.0037, + "step": 33922 + }, + { + "epoch": 0.33923, + "grad_norm": 0.6242461943212776, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 33923 + }, + { + "epoch": 0.33924, + "grad_norm": 0.5527384875407005, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 33924 + }, + { + "epoch": 0.33925, + "grad_norm": 0.4839709214601328, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 33925 + }, + { + "epoch": 0.33926, + "grad_norm": 0.6095916516369055, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 33926 + }, + { + "epoch": 0.33927, + "grad_norm": 0.6524598232060316, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 33927 + }, + { + "epoch": 0.33928, + "grad_norm": 0.6409222711416348, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 33928 + }, + { + "epoch": 0.33929, + "grad_norm": 0.6054119299147679, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 33929 + }, + { + "epoch": 0.3393, + "grad_norm": 0.6718707521160994, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 33930 + }, + { + "epoch": 0.33931, + "grad_norm": 0.8492154360351177, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 33931 + }, + { + "epoch": 0.33932, + "grad_norm": 1.1363820900629902, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 33932 + }, + { + "epoch": 0.33933, + "grad_norm": 1.18482528867355, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 33933 + }, + { + "epoch": 0.33934, + "grad_norm": 0.7876477308776886, + "learning_rate": 0.003, + "loss": 4.0014, + "step": 33934 + }, + { + "epoch": 0.33935, + "grad_norm": 0.6987729511030895, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 33935 + }, + { + "epoch": 0.33936, + "grad_norm": 0.6446699575264577, + "learning_rate": 0.003, + "loss": 4.017, + "step": 33936 + }, + { + "epoch": 0.33937, + "grad_norm": 0.7463389124676082, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 33937 + }, + { + "epoch": 0.33938, + "grad_norm": 0.8568218017450804, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 33938 + }, + { + "epoch": 0.33939, + "grad_norm": 0.8028665628090043, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 33939 + }, + { + "epoch": 0.3394, + "grad_norm": 0.8306171453670893, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 33940 + }, + { + "epoch": 0.33941, + "grad_norm": 0.9429347869024594, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 33941 + }, + { + "epoch": 0.33942, + "grad_norm": 1.003440560699561, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 33942 + }, + { + "epoch": 0.33943, + "grad_norm": 1.018685539938234, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 33943 + }, + { + "epoch": 0.33944, + "grad_norm": 0.999758026657778, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 33944 + }, + { + "epoch": 0.33945, + "grad_norm": 0.9001894819472159, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 33945 + }, + { + "epoch": 0.33946, + "grad_norm": 0.7491383780742519, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 33946 + }, + { + "epoch": 0.33947, + "grad_norm": 0.794097073614366, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 33947 + }, + { + "epoch": 0.33948, + "grad_norm": 0.8145666147577381, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 33948 + }, + { + "epoch": 0.33949, + "grad_norm": 0.8418176852530579, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 33949 + }, + { + "epoch": 0.3395, + "grad_norm": 0.8031660879662917, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 33950 + }, + { + "epoch": 0.33951, + "grad_norm": 0.7998934076220618, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 33951 + }, + { + "epoch": 0.33952, + "grad_norm": 0.9087819939081989, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 33952 + }, + { + "epoch": 0.33953, + "grad_norm": 0.9322337828327047, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 33953 + }, + { + "epoch": 0.33954, + "grad_norm": 0.8859636258308221, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 33954 + }, + { + "epoch": 0.33955, + "grad_norm": 0.778486998371977, + "learning_rate": 0.003, + "loss": 4.065, + "step": 33955 + }, + { + "epoch": 0.33956, + "grad_norm": 0.8016595452512968, + "learning_rate": 0.003, + "loss": 4.0059, + "step": 33956 + }, + { + "epoch": 0.33957, + "grad_norm": 0.9232747762393058, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 33957 + }, + { + "epoch": 0.33958, + "grad_norm": 1.0837467413276234, + "learning_rate": 0.003, + "loss": 4.045, + "step": 33958 + }, + { + "epoch": 0.33959, + "grad_norm": 0.8708310193033623, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 33959 + }, + { + "epoch": 0.3396, + "grad_norm": 0.7797993622130089, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 33960 + }, + { + "epoch": 0.33961, + "grad_norm": 0.6730679547674745, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 33961 + }, + { + "epoch": 0.33962, + "grad_norm": 0.7344531463258496, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 33962 + }, + { + "epoch": 0.33963, + "grad_norm": 0.805683984962356, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 33963 + }, + { + "epoch": 0.33964, + "grad_norm": 0.8573894402592114, + "learning_rate": 0.003, + "loss": 3.9925, + "step": 33964 + }, + { + "epoch": 0.33965, + "grad_norm": 0.8397193555461177, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 33965 + }, + { + "epoch": 0.33966, + "grad_norm": 0.8354234908248966, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 33966 + }, + { + "epoch": 0.33967, + "grad_norm": 0.832628518718302, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 33967 + }, + { + "epoch": 0.33968, + "grad_norm": 0.8827716141383298, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 33968 + }, + { + "epoch": 0.33969, + "grad_norm": 1.0843105209490447, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 33969 + }, + { + "epoch": 0.3397, + "grad_norm": 1.118876305433769, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 33970 + }, + { + "epoch": 0.33971, + "grad_norm": 0.8240167196639178, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 33971 + }, + { + "epoch": 0.33972, + "grad_norm": 0.7091845877462508, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 33972 + }, + { + "epoch": 0.33973, + "grad_norm": 0.6037823851005784, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 33973 + }, + { + "epoch": 0.33974, + "grad_norm": 0.6305169759226481, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 33974 + }, + { + "epoch": 0.33975, + "grad_norm": 0.7030336458663832, + "learning_rate": 0.003, + "loss": 3.9923, + "step": 33975 + }, + { + "epoch": 0.33976, + "grad_norm": 0.7334082368041822, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 33976 + }, + { + "epoch": 0.33977, + "grad_norm": 0.821646163544227, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 33977 + }, + { + "epoch": 0.33978, + "grad_norm": 0.8156406817289322, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 33978 + }, + { + "epoch": 0.33979, + "grad_norm": 0.766630193106133, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 33979 + }, + { + "epoch": 0.3398, + "grad_norm": 0.7663197666229395, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 33980 + }, + { + "epoch": 0.33981, + "grad_norm": 0.8472479667847751, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 33981 + }, + { + "epoch": 0.33982, + "grad_norm": 0.9859562433248386, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 33982 + }, + { + "epoch": 0.33983, + "grad_norm": 1.1190536882855842, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 33983 + }, + { + "epoch": 0.33984, + "grad_norm": 0.908836173729518, + "learning_rate": 0.003, + "loss": 4.031, + "step": 33984 + }, + { + "epoch": 0.33985, + "grad_norm": 0.94922326366902, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 33985 + }, + { + "epoch": 0.33986, + "grad_norm": 0.8023988802160777, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 33986 + }, + { + "epoch": 0.33987, + "grad_norm": 0.8540913040229845, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 33987 + }, + { + "epoch": 0.33988, + "grad_norm": 0.7839512907011877, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 33988 + }, + { + "epoch": 0.33989, + "grad_norm": 0.7892693763947001, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 33989 + }, + { + "epoch": 0.3399, + "grad_norm": 0.9671844311715436, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 33990 + }, + { + "epoch": 0.33991, + "grad_norm": 1.2058800017127689, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 33991 + }, + { + "epoch": 0.33992, + "grad_norm": 1.0469266471284238, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 33992 + }, + { + "epoch": 0.33993, + "grad_norm": 0.9374829411302804, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 33993 + }, + { + "epoch": 0.33994, + "grad_norm": 0.7611668201801775, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 33994 + }, + { + "epoch": 0.33995, + "grad_norm": 0.7201002905573799, + "learning_rate": 0.003, + "loss": 4.027, + "step": 33995 + }, + { + "epoch": 0.33996, + "grad_norm": 0.7542273113510776, + "learning_rate": 0.003, + "loss": 4.04, + "step": 33996 + }, + { + "epoch": 0.33997, + "grad_norm": 0.8414992382540966, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 33997 + }, + { + "epoch": 0.33998, + "grad_norm": 1.0506033711003715, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 33998 + }, + { + "epoch": 0.33999, + "grad_norm": 1.0494184252834238, + "learning_rate": 0.003, + "loss": 4.062, + "step": 33999 + }, + { + "epoch": 0.34, + "grad_norm": 0.808229401525309, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 34000 + }, + { + "epoch": 0.34001, + "grad_norm": 0.6813712460308294, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 34001 + }, + { + "epoch": 0.34002, + "grad_norm": 0.7061821576133944, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 34002 + }, + { + "epoch": 0.34003, + "grad_norm": 0.7761118187110538, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 34003 + }, + { + "epoch": 0.34004, + "grad_norm": 0.8928893587257872, + "learning_rate": 0.003, + "loss": 4.061, + "step": 34004 + }, + { + "epoch": 0.34005, + "grad_norm": 1.0744947409562577, + "learning_rate": 0.003, + "loss": 4.038, + "step": 34005 + }, + { + "epoch": 0.34006, + "grad_norm": 0.768060507798, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 34006 + }, + { + "epoch": 0.34007, + "grad_norm": 0.5726100003169947, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 34007 + }, + { + "epoch": 0.34008, + "grad_norm": 0.7205296492804241, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 34008 + }, + { + "epoch": 0.34009, + "grad_norm": 0.8126641289915525, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 34009 + }, + { + "epoch": 0.3401, + "grad_norm": 0.8405270132090459, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 34010 + }, + { + "epoch": 0.34011, + "grad_norm": 0.8016598353678006, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 34011 + }, + { + "epoch": 0.34012, + "grad_norm": 0.7236283488909129, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 34012 + }, + { + "epoch": 0.34013, + "grad_norm": 0.6918793412724296, + "learning_rate": 0.003, + "loss": 4.031, + "step": 34013 + }, + { + "epoch": 0.34014, + "grad_norm": 0.6888009211995693, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 34014 + }, + { + "epoch": 0.34015, + "grad_norm": 0.6414926492987785, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 34015 + }, + { + "epoch": 0.34016, + "grad_norm": 0.6788533880304966, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 34016 + }, + { + "epoch": 0.34017, + "grad_norm": 0.8881637199886941, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 34017 + }, + { + "epoch": 0.34018, + "grad_norm": 1.1383292408221541, + "learning_rate": 0.003, + "loss": 4.029, + "step": 34018 + }, + { + "epoch": 0.34019, + "grad_norm": 0.862798995041277, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 34019 + }, + { + "epoch": 0.3402, + "grad_norm": 0.8807963523114474, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 34020 + }, + { + "epoch": 0.34021, + "grad_norm": 0.8937778301909161, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 34021 + }, + { + "epoch": 0.34022, + "grad_norm": 0.9040918660881367, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 34022 + }, + { + "epoch": 0.34023, + "grad_norm": 0.8931219188168983, + "learning_rate": 0.003, + "loss": 4.0059, + "step": 34023 + }, + { + "epoch": 0.34024, + "grad_norm": 0.8368951453772987, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 34024 + }, + { + "epoch": 0.34025, + "grad_norm": 0.820385441185719, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 34025 + }, + { + "epoch": 0.34026, + "grad_norm": 0.9104867951152543, + "learning_rate": 0.003, + "loss": 4.008, + "step": 34026 + }, + { + "epoch": 0.34027, + "grad_norm": 1.0213425725725176, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 34027 + }, + { + "epoch": 0.34028, + "grad_norm": 1.0102829346173594, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 34028 + }, + { + "epoch": 0.34029, + "grad_norm": 0.7919918083080788, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 34029 + }, + { + "epoch": 0.3403, + "grad_norm": 0.6635644161056327, + "learning_rate": 0.003, + "loss": 4.062, + "step": 34030 + }, + { + "epoch": 0.34031, + "grad_norm": 0.7348682684672848, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 34031 + }, + { + "epoch": 0.34032, + "grad_norm": 0.8345681208634823, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 34032 + }, + { + "epoch": 0.34033, + "grad_norm": 1.0080460432351017, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 34033 + }, + { + "epoch": 0.34034, + "grad_norm": 1.0546084720353353, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 34034 + }, + { + "epoch": 0.34035, + "grad_norm": 0.9627829127313069, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 34035 + }, + { + "epoch": 0.34036, + "grad_norm": 0.9231483428389341, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 34036 + }, + { + "epoch": 0.34037, + "grad_norm": 0.9560914068311439, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 34037 + }, + { + "epoch": 0.34038, + "grad_norm": 0.8899091765631013, + "learning_rate": 0.003, + "loss": 4.028, + "step": 34038 + }, + { + "epoch": 0.34039, + "grad_norm": 0.7990126485273582, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 34039 + }, + { + "epoch": 0.3404, + "grad_norm": 0.8140745802549377, + "learning_rate": 0.003, + "loss": 4.034, + "step": 34040 + }, + { + "epoch": 0.34041, + "grad_norm": 0.8571463368294632, + "learning_rate": 0.003, + "loss": 3.9717, + "step": 34041 + }, + { + "epoch": 0.34042, + "grad_norm": 0.9846784498839475, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 34042 + }, + { + "epoch": 0.34043, + "grad_norm": 1.0988393678946655, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 34043 + }, + { + "epoch": 0.34044, + "grad_norm": 1.141264938068107, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 34044 + }, + { + "epoch": 0.34045, + "grad_norm": 0.9291993666282642, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 34045 + }, + { + "epoch": 0.34046, + "grad_norm": 0.8216586907874481, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 34046 + }, + { + "epoch": 0.34047, + "grad_norm": 0.8771715725689065, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 34047 + }, + { + "epoch": 0.34048, + "grad_norm": 0.9754138697928109, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 34048 + }, + { + "epoch": 0.34049, + "grad_norm": 1.0165560969133782, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 34049 + }, + { + "epoch": 0.3405, + "grad_norm": 0.9512061188390406, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 34050 + }, + { + "epoch": 0.34051, + "grad_norm": 0.9573857064088921, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 34051 + }, + { + "epoch": 0.34052, + "grad_norm": 1.0567655719933524, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 34052 + }, + { + "epoch": 0.34053, + "grad_norm": 0.9456061501177918, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 34053 + }, + { + "epoch": 0.34054, + "grad_norm": 0.891822733391886, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 34054 + }, + { + "epoch": 0.34055, + "grad_norm": 1.066353289188147, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 34055 + }, + { + "epoch": 0.34056, + "grad_norm": 0.9921176709983363, + "learning_rate": 0.003, + "loss": 4.064, + "step": 34056 + }, + { + "epoch": 0.34057, + "grad_norm": 0.8443572215065228, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 34057 + }, + { + "epoch": 0.34058, + "grad_norm": 0.6659394578129543, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 34058 + }, + { + "epoch": 0.34059, + "grad_norm": 0.642632778360411, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 34059 + }, + { + "epoch": 0.3406, + "grad_norm": 0.6846821669390213, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 34060 + }, + { + "epoch": 0.34061, + "grad_norm": 0.6157083788952982, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 34061 + }, + { + "epoch": 0.34062, + "grad_norm": 0.5995327449973084, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 34062 + }, + { + "epoch": 0.34063, + "grad_norm": 0.6390337631402602, + "learning_rate": 0.003, + "loss": 4.051, + "step": 34063 + }, + { + "epoch": 0.34064, + "grad_norm": 0.6862217919974073, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 34064 + }, + { + "epoch": 0.34065, + "grad_norm": 0.7079321973795291, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 34065 + }, + { + "epoch": 0.34066, + "grad_norm": 0.7422396070746735, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 34066 + }, + { + "epoch": 0.34067, + "grad_norm": 0.7694410431965949, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 34067 + }, + { + "epoch": 0.34068, + "grad_norm": 0.7298912418147402, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 34068 + }, + { + "epoch": 0.34069, + "grad_norm": 0.696396097562659, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 34069 + }, + { + "epoch": 0.3407, + "grad_norm": 0.8039333358037213, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 34070 + }, + { + "epoch": 0.34071, + "grad_norm": 0.9188471858299033, + "learning_rate": 0.003, + "loss": 4.043, + "step": 34071 + }, + { + "epoch": 0.34072, + "grad_norm": 0.9335020564533633, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 34072 + }, + { + "epoch": 0.34073, + "grad_norm": 0.7619691598775563, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 34073 + }, + { + "epoch": 0.34074, + "grad_norm": 0.5664050427702848, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 34074 + }, + { + "epoch": 0.34075, + "grad_norm": 0.5991461087743574, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 34075 + }, + { + "epoch": 0.34076, + "grad_norm": 0.7252386777453789, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 34076 + }, + { + "epoch": 0.34077, + "grad_norm": 0.7563183933703501, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 34077 + }, + { + "epoch": 0.34078, + "grad_norm": 0.8165078555796453, + "learning_rate": 0.003, + "loss": 4.034, + "step": 34078 + }, + { + "epoch": 0.34079, + "grad_norm": 0.8131740459003255, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 34079 + }, + { + "epoch": 0.3408, + "grad_norm": 0.7191276769288464, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 34080 + }, + { + "epoch": 0.34081, + "grad_norm": 0.6501357944924261, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 34081 + }, + { + "epoch": 0.34082, + "grad_norm": 0.5499518574261527, + "learning_rate": 0.003, + "loss": 4.032, + "step": 34082 + }, + { + "epoch": 0.34083, + "grad_norm": 0.5727021702499225, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 34083 + }, + { + "epoch": 0.34084, + "grad_norm": 0.6434500357256907, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 34084 + }, + { + "epoch": 0.34085, + "grad_norm": 0.8472931670590771, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 34085 + }, + { + "epoch": 0.34086, + "grad_norm": 1.1837189357808138, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 34086 + }, + { + "epoch": 0.34087, + "grad_norm": 0.9735768700501405, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 34087 + }, + { + "epoch": 0.34088, + "grad_norm": 0.960727559172683, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 34088 + }, + { + "epoch": 0.34089, + "grad_norm": 1.0232946868386503, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 34089 + }, + { + "epoch": 0.3409, + "grad_norm": 0.8598195578259243, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 34090 + }, + { + "epoch": 0.34091, + "grad_norm": 0.8561350460659448, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 34091 + }, + { + "epoch": 0.34092, + "grad_norm": 0.9100934333074193, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 34092 + }, + { + "epoch": 0.34093, + "grad_norm": 0.8624772276279844, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 34093 + }, + { + "epoch": 0.34094, + "grad_norm": 0.8473210067197896, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 34094 + }, + { + "epoch": 0.34095, + "grad_norm": 0.8804047951294056, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 34095 + }, + { + "epoch": 0.34096, + "grad_norm": 0.9525547616152052, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 34096 + }, + { + "epoch": 0.34097, + "grad_norm": 0.9265206179146316, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 34097 + }, + { + "epoch": 0.34098, + "grad_norm": 0.8123424376342245, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 34098 + }, + { + "epoch": 0.34099, + "grad_norm": 0.7773731154550838, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 34099 + }, + { + "epoch": 0.341, + "grad_norm": 0.7897351015906173, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 34100 + }, + { + "epoch": 0.34101, + "grad_norm": 0.7565509952428561, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 34101 + }, + { + "epoch": 0.34102, + "grad_norm": 0.8756526847494094, + "learning_rate": 0.003, + "loss": 4.052, + "step": 34102 + }, + { + "epoch": 0.34103, + "grad_norm": 1.1298907259418725, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 34103 + }, + { + "epoch": 0.34104, + "grad_norm": 0.8588107269951857, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 34104 + }, + { + "epoch": 0.34105, + "grad_norm": 0.7267037468687892, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 34105 + }, + { + "epoch": 0.34106, + "grad_norm": 0.6482819078968429, + "learning_rate": 0.003, + "loss": 4.024, + "step": 34106 + }, + { + "epoch": 0.34107, + "grad_norm": 0.5770128843700393, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 34107 + }, + { + "epoch": 0.34108, + "grad_norm": 0.5558906508416201, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 34108 + }, + { + "epoch": 0.34109, + "grad_norm": 0.5618263887178572, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 34109 + }, + { + "epoch": 0.3411, + "grad_norm": 0.6417931560600224, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 34110 + }, + { + "epoch": 0.34111, + "grad_norm": 0.6653043732172801, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 34111 + }, + { + "epoch": 0.34112, + "grad_norm": 0.8177576843996563, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 34112 + }, + { + "epoch": 0.34113, + "grad_norm": 1.009209524088005, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 34113 + }, + { + "epoch": 0.34114, + "grad_norm": 1.2105239173830795, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 34114 + }, + { + "epoch": 0.34115, + "grad_norm": 0.8670529695453328, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 34115 + }, + { + "epoch": 0.34116, + "grad_norm": 0.8026099400566107, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 34116 + }, + { + "epoch": 0.34117, + "grad_norm": 0.7474280072851388, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 34117 + }, + { + "epoch": 0.34118, + "grad_norm": 0.7166166608726242, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 34118 + }, + { + "epoch": 0.34119, + "grad_norm": 0.6939769902389634, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 34119 + }, + { + "epoch": 0.3412, + "grad_norm": 0.7100201665571866, + "learning_rate": 0.003, + "loss": 3.9947, + "step": 34120 + }, + { + "epoch": 0.34121, + "grad_norm": 0.7510456913023875, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 34121 + }, + { + "epoch": 0.34122, + "grad_norm": 0.826391508852153, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 34122 + }, + { + "epoch": 0.34123, + "grad_norm": 0.8238184220538075, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 34123 + }, + { + "epoch": 0.34124, + "grad_norm": 0.7036154175466772, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 34124 + }, + { + "epoch": 0.34125, + "grad_norm": 0.70798884496288, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 34125 + }, + { + "epoch": 0.34126, + "grad_norm": 0.7434099770722953, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 34126 + }, + { + "epoch": 0.34127, + "grad_norm": 0.8771941891752033, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 34127 + }, + { + "epoch": 0.34128, + "grad_norm": 1.1911679237448496, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 34128 + }, + { + "epoch": 0.34129, + "grad_norm": 0.8597111924644183, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 34129 + }, + { + "epoch": 0.3413, + "grad_norm": 0.8009334956746131, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 34130 + }, + { + "epoch": 0.34131, + "grad_norm": 0.8834728711698868, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 34131 + }, + { + "epoch": 0.34132, + "grad_norm": 0.875896101013776, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 34132 + }, + { + "epoch": 0.34133, + "grad_norm": 0.8493992695546244, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 34133 + }, + { + "epoch": 0.34134, + "grad_norm": 0.9216079654849639, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 34134 + }, + { + "epoch": 0.34135, + "grad_norm": 0.9320815039231054, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 34135 + }, + { + "epoch": 0.34136, + "grad_norm": 1.0271660256908617, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 34136 + }, + { + "epoch": 0.34137, + "grad_norm": 1.0819310354396876, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 34137 + }, + { + "epoch": 0.34138, + "grad_norm": 0.8406774328826657, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 34138 + }, + { + "epoch": 0.34139, + "grad_norm": 0.930939794708705, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 34139 + }, + { + "epoch": 0.3414, + "grad_norm": 0.9348035174712852, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 34140 + }, + { + "epoch": 0.34141, + "grad_norm": 1.0572559013801959, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 34141 + }, + { + "epoch": 0.34142, + "grad_norm": 0.8560503341282029, + "learning_rate": 0.003, + "loss": 4.033, + "step": 34142 + }, + { + "epoch": 0.34143, + "grad_norm": 0.7722441740829369, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 34143 + }, + { + "epoch": 0.34144, + "grad_norm": 0.8219747528847562, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 34144 + }, + { + "epoch": 0.34145, + "grad_norm": 0.8958487417277264, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 34145 + }, + { + "epoch": 0.34146, + "grad_norm": 1.0690156125225527, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 34146 + }, + { + "epoch": 0.34147, + "grad_norm": 1.3689574985764732, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 34147 + }, + { + "epoch": 0.34148, + "grad_norm": 0.5994807302880562, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 34148 + }, + { + "epoch": 0.34149, + "grad_norm": 0.7049412081228258, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 34149 + }, + { + "epoch": 0.3415, + "grad_norm": 0.8585650402085708, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 34150 + }, + { + "epoch": 0.34151, + "grad_norm": 1.0825769961444125, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 34151 + }, + { + "epoch": 0.34152, + "grad_norm": 0.9241934280552466, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 34152 + }, + { + "epoch": 0.34153, + "grad_norm": 0.8366249375605892, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 34153 + }, + { + "epoch": 0.34154, + "grad_norm": 0.7013071183313523, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 34154 + }, + { + "epoch": 0.34155, + "grad_norm": 0.6041906261293323, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 34155 + }, + { + "epoch": 0.34156, + "grad_norm": 0.7402391278605512, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 34156 + }, + { + "epoch": 0.34157, + "grad_norm": 0.7990377051906979, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 34157 + }, + { + "epoch": 0.34158, + "grad_norm": 0.9276508140199522, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 34158 + }, + { + "epoch": 0.34159, + "grad_norm": 1.032593761516333, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 34159 + }, + { + "epoch": 0.3416, + "grad_norm": 0.8636707114485934, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 34160 + }, + { + "epoch": 0.34161, + "grad_norm": 0.788374302409921, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 34161 + }, + { + "epoch": 0.34162, + "grad_norm": 0.7413028408943177, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 34162 + }, + { + "epoch": 0.34163, + "grad_norm": 0.6995780315432757, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 34163 + }, + { + "epoch": 0.34164, + "grad_norm": 0.6679726440798593, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 34164 + }, + { + "epoch": 0.34165, + "grad_norm": 0.749514724162423, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 34165 + }, + { + "epoch": 0.34166, + "grad_norm": 1.0415305841149887, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 34166 + }, + { + "epoch": 0.34167, + "grad_norm": 1.2616302331557625, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 34167 + }, + { + "epoch": 0.34168, + "grad_norm": 0.8446072718374869, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 34168 + }, + { + "epoch": 0.34169, + "grad_norm": 0.7913534884000306, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 34169 + }, + { + "epoch": 0.3417, + "grad_norm": 0.7455098656572278, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 34170 + }, + { + "epoch": 0.34171, + "grad_norm": 0.7576731268537117, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 34171 + }, + { + "epoch": 0.34172, + "grad_norm": 0.8154926551929459, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 34172 + }, + { + "epoch": 0.34173, + "grad_norm": 0.9365266264737857, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 34173 + }, + { + "epoch": 0.34174, + "grad_norm": 0.9724254325166384, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 34174 + }, + { + "epoch": 0.34175, + "grad_norm": 0.9108502400625214, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 34175 + }, + { + "epoch": 0.34176, + "grad_norm": 0.9737213543426644, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 34176 + }, + { + "epoch": 0.34177, + "grad_norm": 1.1966532269912842, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 34177 + }, + { + "epoch": 0.34178, + "grad_norm": 0.888633356587651, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 34178 + }, + { + "epoch": 0.34179, + "grad_norm": 0.8346376101778342, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 34179 + }, + { + "epoch": 0.3418, + "grad_norm": 0.8115699936073141, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 34180 + }, + { + "epoch": 0.34181, + "grad_norm": 0.8815818334363897, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 34181 + }, + { + "epoch": 0.34182, + "grad_norm": 0.9259141402651386, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 34182 + }, + { + "epoch": 0.34183, + "grad_norm": 0.8537826835310705, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 34183 + }, + { + "epoch": 0.34184, + "grad_norm": 0.9241610101979926, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 34184 + }, + { + "epoch": 0.34185, + "grad_norm": 0.9901197968333971, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 34185 + }, + { + "epoch": 0.34186, + "grad_norm": 0.9654579870254195, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 34186 + }, + { + "epoch": 0.34187, + "grad_norm": 0.8881702859059837, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 34187 + }, + { + "epoch": 0.34188, + "grad_norm": 0.7987541930638775, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 34188 + }, + { + "epoch": 0.34189, + "grad_norm": 0.7104788724639892, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 34189 + }, + { + "epoch": 0.3419, + "grad_norm": 0.772970697093058, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 34190 + }, + { + "epoch": 0.34191, + "grad_norm": 0.8264135686570176, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 34191 + }, + { + "epoch": 0.34192, + "grad_norm": 0.7285398936370029, + "learning_rate": 0.003, + "loss": 4.039, + "step": 34192 + }, + { + "epoch": 0.34193, + "grad_norm": 0.7653744786091489, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 34193 + }, + { + "epoch": 0.34194, + "grad_norm": 0.7511782282319743, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 34194 + }, + { + "epoch": 0.34195, + "grad_norm": 0.830378425248287, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 34195 + }, + { + "epoch": 0.34196, + "grad_norm": 0.9757630249615244, + "learning_rate": 0.003, + "loss": 4.031, + "step": 34196 + }, + { + "epoch": 0.34197, + "grad_norm": 1.0666845351141039, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 34197 + }, + { + "epoch": 0.34198, + "grad_norm": 0.8178727455250759, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 34198 + }, + { + "epoch": 0.34199, + "grad_norm": 0.5681718769843592, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 34199 + }, + { + "epoch": 0.342, + "grad_norm": 0.6054103724911871, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 34200 + }, + { + "epoch": 0.34201, + "grad_norm": 0.583388878685876, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 34201 + }, + { + "epoch": 0.34202, + "grad_norm": 0.6253589025203606, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 34202 + }, + { + "epoch": 0.34203, + "grad_norm": 0.6493058510088409, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 34203 + }, + { + "epoch": 0.34204, + "grad_norm": 0.63778945452355, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 34204 + }, + { + "epoch": 0.34205, + "grad_norm": 0.769849271470049, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 34205 + }, + { + "epoch": 0.34206, + "grad_norm": 0.9723975282050301, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 34206 + }, + { + "epoch": 0.34207, + "grad_norm": 1.082945186616203, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 34207 + }, + { + "epoch": 0.34208, + "grad_norm": 0.98318130861273, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 34208 + }, + { + "epoch": 0.34209, + "grad_norm": 1.2282167641912864, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 34209 + }, + { + "epoch": 0.3421, + "grad_norm": 0.8958423677019361, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 34210 + }, + { + "epoch": 0.34211, + "grad_norm": 0.8478548503732254, + "learning_rate": 0.003, + "loss": 4.063, + "step": 34211 + }, + { + "epoch": 0.34212, + "grad_norm": 0.6949704387137674, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 34212 + }, + { + "epoch": 0.34213, + "grad_norm": 0.7417330302214048, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 34213 + }, + { + "epoch": 0.34214, + "grad_norm": 0.7420227201950665, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 34214 + }, + { + "epoch": 0.34215, + "grad_norm": 0.6378521754983429, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 34215 + }, + { + "epoch": 0.34216, + "grad_norm": 0.689544028938562, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 34216 + }, + { + "epoch": 0.34217, + "grad_norm": 0.7086479157903997, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 34217 + }, + { + "epoch": 0.34218, + "grad_norm": 0.7673848374100676, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 34218 + }, + { + "epoch": 0.34219, + "grad_norm": 0.816789259412931, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 34219 + }, + { + "epoch": 0.3422, + "grad_norm": 0.8877423433284022, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 34220 + }, + { + "epoch": 0.34221, + "grad_norm": 1.0107137831482358, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 34221 + }, + { + "epoch": 0.34222, + "grad_norm": 1.0337129380165264, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 34222 + }, + { + "epoch": 0.34223, + "grad_norm": 0.8624236572021852, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 34223 + }, + { + "epoch": 0.34224, + "grad_norm": 0.6574268306301482, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 34224 + }, + { + "epoch": 0.34225, + "grad_norm": 0.5699253974519912, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 34225 + }, + { + "epoch": 0.34226, + "grad_norm": 0.5742641829064818, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 34226 + }, + { + "epoch": 0.34227, + "grad_norm": 0.6445195732943844, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 34227 + }, + { + "epoch": 0.34228, + "grad_norm": 0.7741742423700957, + "learning_rate": 0.003, + "loss": 4.017, + "step": 34228 + }, + { + "epoch": 0.34229, + "grad_norm": 1.0830562373713817, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 34229 + }, + { + "epoch": 0.3423, + "grad_norm": 1.1508411160115481, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 34230 + }, + { + "epoch": 0.34231, + "grad_norm": 0.7479905326894947, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 34231 + }, + { + "epoch": 0.34232, + "grad_norm": 0.7006662892860149, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 34232 + }, + { + "epoch": 0.34233, + "grad_norm": 0.7284964864633967, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 34233 + }, + { + "epoch": 0.34234, + "grad_norm": 0.8211042755979867, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 34234 + }, + { + "epoch": 0.34235, + "grad_norm": 0.8996660822931208, + "learning_rate": 0.003, + "loss": 4.0066, + "step": 34235 + }, + { + "epoch": 0.34236, + "grad_norm": 0.9307934822601484, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 34236 + }, + { + "epoch": 0.34237, + "grad_norm": 0.9908394542096045, + "learning_rate": 0.003, + "loss": 4.025, + "step": 34237 + }, + { + "epoch": 0.34238, + "grad_norm": 0.8665372974822444, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 34238 + }, + { + "epoch": 0.34239, + "grad_norm": 0.8144994677925155, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 34239 + }, + { + "epoch": 0.3424, + "grad_norm": 0.8615312584944728, + "learning_rate": 0.003, + "loss": 4.025, + "step": 34240 + }, + { + "epoch": 0.34241, + "grad_norm": 0.8268039419964168, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 34241 + }, + { + "epoch": 0.34242, + "grad_norm": 0.7131123957611836, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 34242 + }, + { + "epoch": 0.34243, + "grad_norm": 0.8036957144140716, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 34243 + }, + { + "epoch": 0.34244, + "grad_norm": 0.8958642843159621, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 34244 + }, + { + "epoch": 0.34245, + "grad_norm": 0.9980564249085984, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 34245 + }, + { + "epoch": 0.34246, + "grad_norm": 1.012051849020208, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 34246 + }, + { + "epoch": 0.34247, + "grad_norm": 1.0778167626269333, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 34247 + }, + { + "epoch": 0.34248, + "grad_norm": 0.9765579892746566, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 34248 + }, + { + "epoch": 0.34249, + "grad_norm": 0.8649853649720828, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 34249 + }, + { + "epoch": 0.3425, + "grad_norm": 0.7975365810670239, + "learning_rate": 0.003, + "loss": 4.035, + "step": 34250 + }, + { + "epoch": 0.34251, + "grad_norm": 0.7151677310016729, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 34251 + }, + { + "epoch": 0.34252, + "grad_norm": 0.7078988911251008, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 34252 + }, + { + "epoch": 0.34253, + "grad_norm": 0.7918740097700563, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 34253 + }, + { + "epoch": 0.34254, + "grad_norm": 0.9367906909600723, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 34254 + }, + { + "epoch": 0.34255, + "grad_norm": 1.0754901178345573, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 34255 + }, + { + "epoch": 0.34256, + "grad_norm": 1.0511790133931227, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 34256 + }, + { + "epoch": 0.34257, + "grad_norm": 0.9128938945746193, + "learning_rate": 0.003, + "loss": 4.038, + "step": 34257 + }, + { + "epoch": 0.34258, + "grad_norm": 0.7837822125276601, + "learning_rate": 0.003, + "loss": 4.053, + "step": 34258 + }, + { + "epoch": 0.34259, + "grad_norm": 0.7319731593079192, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 34259 + }, + { + "epoch": 0.3426, + "grad_norm": 0.8052981107971173, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 34260 + }, + { + "epoch": 0.34261, + "grad_norm": 0.7960667158634984, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 34261 + }, + { + "epoch": 0.34262, + "grad_norm": 0.8283638397118233, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 34262 + }, + { + "epoch": 0.34263, + "grad_norm": 1.049207409215749, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 34263 + }, + { + "epoch": 0.34264, + "grad_norm": 1.087561548757546, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 34264 + }, + { + "epoch": 0.34265, + "grad_norm": 0.8660800275215281, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 34265 + }, + { + "epoch": 0.34266, + "grad_norm": 0.7108522858875274, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 34266 + }, + { + "epoch": 0.34267, + "grad_norm": 0.6790127019678809, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 34267 + }, + { + "epoch": 0.34268, + "grad_norm": 0.5172674225151003, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 34268 + }, + { + "epoch": 0.34269, + "grad_norm": 0.5516572579737573, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 34269 + }, + { + "epoch": 0.3427, + "grad_norm": 0.4875027218016631, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 34270 + }, + { + "epoch": 0.34271, + "grad_norm": 0.49162549557791874, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 34271 + }, + { + "epoch": 0.34272, + "grad_norm": 0.5669864914452368, + "learning_rate": 0.003, + "loss": 4.007, + "step": 34272 + }, + { + "epoch": 0.34273, + "grad_norm": 0.6400140131502341, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 34273 + }, + { + "epoch": 0.34274, + "grad_norm": 0.7492609919725377, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 34274 + }, + { + "epoch": 0.34275, + "grad_norm": 0.6632336496903914, + "learning_rate": 0.003, + "loss": 4.0051, + "step": 34275 + }, + { + "epoch": 0.34276, + "grad_norm": 0.6201661330325717, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 34276 + }, + { + "epoch": 0.34277, + "grad_norm": 0.7305374742525537, + "learning_rate": 0.003, + "loss": 4.0008, + "step": 34277 + }, + { + "epoch": 0.34278, + "grad_norm": 0.9853924506892965, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 34278 + }, + { + "epoch": 0.34279, + "grad_norm": 1.3939815351793787, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 34279 + }, + { + "epoch": 0.3428, + "grad_norm": 0.5708547303721172, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 34280 + }, + { + "epoch": 0.34281, + "grad_norm": 0.8135471227141484, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 34281 + }, + { + "epoch": 0.34282, + "grad_norm": 0.959858579324949, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 34282 + }, + { + "epoch": 0.34283, + "grad_norm": 0.8405824977296205, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 34283 + }, + { + "epoch": 0.34284, + "grad_norm": 0.7954336512035353, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 34284 + }, + { + "epoch": 0.34285, + "grad_norm": 0.9188836007621363, + "learning_rate": 0.003, + "loss": 4.022, + "step": 34285 + }, + { + "epoch": 0.34286, + "grad_norm": 0.8535824582432092, + "learning_rate": 0.003, + "loss": 4.074, + "step": 34286 + }, + { + "epoch": 0.34287, + "grad_norm": 0.799695422441215, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 34287 + }, + { + "epoch": 0.34288, + "grad_norm": 0.9174510006166071, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 34288 + }, + { + "epoch": 0.34289, + "grad_norm": 0.9391807684809665, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 34289 + }, + { + "epoch": 0.3429, + "grad_norm": 0.8578236790706937, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 34290 + }, + { + "epoch": 0.34291, + "grad_norm": 0.8729531764495353, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 34291 + }, + { + "epoch": 0.34292, + "grad_norm": 0.9433296627524341, + "learning_rate": 0.003, + "loss": 4.048, + "step": 34292 + }, + { + "epoch": 0.34293, + "grad_norm": 1.0635091446047824, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 34293 + }, + { + "epoch": 0.34294, + "grad_norm": 0.9978057822497826, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 34294 + }, + { + "epoch": 0.34295, + "grad_norm": 1.07225375221665, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 34295 + }, + { + "epoch": 0.34296, + "grad_norm": 0.9347544182003088, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 34296 + }, + { + "epoch": 0.34297, + "grad_norm": 0.934710529614385, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 34297 + }, + { + "epoch": 0.34298, + "grad_norm": 1.0179271556328928, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 34298 + }, + { + "epoch": 0.34299, + "grad_norm": 1.0049584509086138, + "learning_rate": 0.003, + "loss": 4.06, + "step": 34299 + }, + { + "epoch": 0.343, + "grad_norm": 0.9525167737102912, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 34300 + }, + { + "epoch": 0.34301, + "grad_norm": 1.1083105118296863, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 34301 + }, + { + "epoch": 0.34302, + "grad_norm": 0.9471370460009991, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 34302 + }, + { + "epoch": 0.34303, + "grad_norm": 0.8181247966331232, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 34303 + }, + { + "epoch": 0.34304, + "grad_norm": 0.7220401322600519, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 34304 + }, + { + "epoch": 0.34305, + "grad_norm": 0.6766740733035059, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 34305 + }, + { + "epoch": 0.34306, + "grad_norm": 0.62972985859045, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 34306 + }, + { + "epoch": 0.34307, + "grad_norm": 0.5935099991824007, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 34307 + }, + { + "epoch": 0.34308, + "grad_norm": 0.6586530179350258, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 34308 + }, + { + "epoch": 0.34309, + "grad_norm": 0.7947822828267335, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 34309 + }, + { + "epoch": 0.3431, + "grad_norm": 0.8755811774071334, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 34310 + }, + { + "epoch": 0.34311, + "grad_norm": 0.8978993923874248, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 34311 + }, + { + "epoch": 0.34312, + "grad_norm": 0.8065217673946592, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 34312 + }, + { + "epoch": 0.34313, + "grad_norm": 0.7013078414032896, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 34313 + }, + { + "epoch": 0.34314, + "grad_norm": 0.5714876054591733, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 34314 + }, + { + "epoch": 0.34315, + "grad_norm": 0.5551558743592503, + "learning_rate": 0.003, + "loss": 4.013, + "step": 34315 + }, + { + "epoch": 0.34316, + "grad_norm": 0.6775217475244516, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 34316 + }, + { + "epoch": 0.34317, + "grad_norm": 0.8237157751720057, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 34317 + }, + { + "epoch": 0.34318, + "grad_norm": 0.8923689250664276, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 34318 + }, + { + "epoch": 0.34319, + "grad_norm": 0.8845253668501907, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 34319 + }, + { + "epoch": 0.3432, + "grad_norm": 0.9120541948215466, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 34320 + }, + { + "epoch": 0.34321, + "grad_norm": 1.0184850553867384, + "learning_rate": 0.003, + "loss": 4.052, + "step": 34321 + }, + { + "epoch": 0.34322, + "grad_norm": 1.0133093429008453, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 34322 + }, + { + "epoch": 0.34323, + "grad_norm": 0.9505683264218208, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 34323 + }, + { + "epoch": 0.34324, + "grad_norm": 1.0555282740390874, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 34324 + }, + { + "epoch": 0.34325, + "grad_norm": 1.1125853493003732, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 34325 + }, + { + "epoch": 0.34326, + "grad_norm": 0.9741249648896545, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 34326 + }, + { + "epoch": 0.34327, + "grad_norm": 0.9218446144112232, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 34327 + }, + { + "epoch": 0.34328, + "grad_norm": 0.908394672496767, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 34328 + }, + { + "epoch": 0.34329, + "grad_norm": 0.9215306116820904, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 34329 + }, + { + "epoch": 0.3433, + "grad_norm": 0.8035282425982846, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 34330 + }, + { + "epoch": 0.34331, + "grad_norm": 0.6781373725729692, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 34331 + }, + { + "epoch": 0.34332, + "grad_norm": 0.853842159110138, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 34332 + }, + { + "epoch": 0.34333, + "grad_norm": 0.8921161225298422, + "learning_rate": 0.003, + "loss": 4.0008, + "step": 34333 + }, + { + "epoch": 0.34334, + "grad_norm": 0.8491323110414845, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 34334 + }, + { + "epoch": 0.34335, + "grad_norm": 0.8131831120473866, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 34335 + }, + { + "epoch": 0.34336, + "grad_norm": 0.7265825088326356, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 34336 + }, + { + "epoch": 0.34337, + "grad_norm": 0.7158437676511544, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 34337 + }, + { + "epoch": 0.34338, + "grad_norm": 0.8460344114523077, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 34338 + }, + { + "epoch": 0.34339, + "grad_norm": 0.9168844906636844, + "learning_rate": 0.003, + "loss": 4.051, + "step": 34339 + }, + { + "epoch": 0.3434, + "grad_norm": 0.9224397538726977, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 34340 + }, + { + "epoch": 0.34341, + "grad_norm": 0.7803160686381381, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 34341 + }, + { + "epoch": 0.34342, + "grad_norm": 0.7390765807257162, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 34342 + }, + { + "epoch": 0.34343, + "grad_norm": 0.7302430056878702, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 34343 + }, + { + "epoch": 0.34344, + "grad_norm": 0.715826148209429, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 34344 + }, + { + "epoch": 0.34345, + "grad_norm": 0.6932553081036588, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 34345 + }, + { + "epoch": 0.34346, + "grad_norm": 0.7673599504695867, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 34346 + }, + { + "epoch": 0.34347, + "grad_norm": 0.7865430439887382, + "learning_rate": 0.003, + "loss": 4.031, + "step": 34347 + }, + { + "epoch": 0.34348, + "grad_norm": 0.8232580619952031, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 34348 + }, + { + "epoch": 0.34349, + "grad_norm": 1.0077988383261338, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 34349 + }, + { + "epoch": 0.3435, + "grad_norm": 1.016309087892976, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 34350 + }, + { + "epoch": 0.34351, + "grad_norm": 0.8970888059866468, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 34351 + }, + { + "epoch": 0.34352, + "grad_norm": 0.8809211631931301, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 34352 + }, + { + "epoch": 0.34353, + "grad_norm": 1.1212846917344534, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 34353 + }, + { + "epoch": 0.34354, + "grad_norm": 1.030031316866575, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 34354 + }, + { + "epoch": 0.34355, + "grad_norm": 0.9733970380447158, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 34355 + }, + { + "epoch": 0.34356, + "grad_norm": 0.9853300531237188, + "learning_rate": 0.003, + "loss": 4.045, + "step": 34356 + }, + { + "epoch": 0.34357, + "grad_norm": 0.9736043010167277, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 34357 + }, + { + "epoch": 0.34358, + "grad_norm": 1.0126975615045644, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 34358 + }, + { + "epoch": 0.34359, + "grad_norm": 1.0113224916306265, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 34359 + }, + { + "epoch": 0.3436, + "grad_norm": 0.8717947372457454, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 34360 + }, + { + "epoch": 0.34361, + "grad_norm": 0.7836203436739804, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 34361 + }, + { + "epoch": 0.34362, + "grad_norm": 0.7515575314005516, + "learning_rate": 0.003, + "loss": 4.048, + "step": 34362 + }, + { + "epoch": 0.34363, + "grad_norm": 0.8304115963877164, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 34363 + }, + { + "epoch": 0.34364, + "grad_norm": 0.9506349334457698, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 34364 + }, + { + "epoch": 0.34365, + "grad_norm": 0.9777236455328896, + "learning_rate": 0.003, + "loss": 4.042, + "step": 34365 + }, + { + "epoch": 0.34366, + "grad_norm": 1.1314325156774205, + "learning_rate": 0.003, + "loss": 4.036, + "step": 34366 + }, + { + "epoch": 0.34367, + "grad_norm": 1.0220916338758927, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 34367 + }, + { + "epoch": 0.34368, + "grad_norm": 0.9200834326138174, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 34368 + }, + { + "epoch": 0.34369, + "grad_norm": 0.8014450665199313, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 34369 + }, + { + "epoch": 0.3437, + "grad_norm": 0.8291264520590482, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 34370 + }, + { + "epoch": 0.34371, + "grad_norm": 0.7457737325038994, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 34371 + }, + { + "epoch": 0.34372, + "grad_norm": 0.7643534957043733, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 34372 + }, + { + "epoch": 0.34373, + "grad_norm": 0.6994266266054555, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 34373 + }, + { + "epoch": 0.34374, + "grad_norm": 0.6606628565346713, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 34374 + }, + { + "epoch": 0.34375, + "grad_norm": 0.718675995581735, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 34375 + }, + { + "epoch": 0.34376, + "grad_norm": 0.689593047774557, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 34376 + }, + { + "epoch": 0.34377, + "grad_norm": 0.5739551326274103, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 34377 + }, + { + "epoch": 0.34378, + "grad_norm": 0.4793925227047839, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 34378 + }, + { + "epoch": 0.34379, + "grad_norm": 0.4633989402473253, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 34379 + }, + { + "epoch": 0.3438, + "grad_norm": 0.4239188184978804, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 34380 + }, + { + "epoch": 0.34381, + "grad_norm": 0.4262550684731709, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 34381 + }, + { + "epoch": 0.34382, + "grad_norm": 0.45615950335474575, + "learning_rate": 0.003, + "loss": 3.9908, + "step": 34382 + }, + { + "epoch": 0.34383, + "grad_norm": 0.472199562235203, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 34383 + }, + { + "epoch": 0.34384, + "grad_norm": 0.6236999368583548, + "learning_rate": 0.003, + "loss": 4.045, + "step": 34384 + }, + { + "epoch": 0.34385, + "grad_norm": 0.772775769192427, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 34385 + }, + { + "epoch": 0.34386, + "grad_norm": 0.8766508638928393, + "learning_rate": 0.003, + "loss": 4.0056, + "step": 34386 + }, + { + "epoch": 0.34387, + "grad_norm": 1.0612280814739388, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 34387 + }, + { + "epoch": 0.34388, + "grad_norm": 1.2906864433168301, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 34388 + }, + { + "epoch": 0.34389, + "grad_norm": 0.7721513568124236, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 34389 + }, + { + "epoch": 0.3439, + "grad_norm": 0.7648390138572144, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 34390 + }, + { + "epoch": 0.34391, + "grad_norm": 0.7821557713956991, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 34391 + }, + { + "epoch": 0.34392, + "grad_norm": 0.9552782014054575, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 34392 + }, + { + "epoch": 0.34393, + "grad_norm": 0.9909735813373586, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 34393 + }, + { + "epoch": 0.34394, + "grad_norm": 0.8989240648359856, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 34394 + }, + { + "epoch": 0.34395, + "grad_norm": 0.8570167605812792, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 34395 + }, + { + "epoch": 0.34396, + "grad_norm": 0.9154828822235801, + "learning_rate": 0.003, + "loss": 4.071, + "step": 34396 + }, + { + "epoch": 0.34397, + "grad_norm": 0.9029045413868424, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 34397 + }, + { + "epoch": 0.34398, + "grad_norm": 0.941098220920906, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 34398 + }, + { + "epoch": 0.34399, + "grad_norm": 1.0314394226835948, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 34399 + }, + { + "epoch": 0.344, + "grad_norm": 1.0509736862048664, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 34400 + }, + { + "epoch": 0.34401, + "grad_norm": 0.9011762535245328, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 34401 + }, + { + "epoch": 0.34402, + "grad_norm": 0.9144210967017222, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 34402 + }, + { + "epoch": 0.34403, + "grad_norm": 0.8578916494018558, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 34403 + }, + { + "epoch": 0.34404, + "grad_norm": 0.8447016674093634, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 34404 + }, + { + "epoch": 0.34405, + "grad_norm": 0.9658522291091811, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 34405 + }, + { + "epoch": 0.34406, + "grad_norm": 1.0656648709692087, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 34406 + }, + { + "epoch": 0.34407, + "grad_norm": 1.059295109034186, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 34407 + }, + { + "epoch": 0.34408, + "grad_norm": 1.1293351786126067, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 34408 + }, + { + "epoch": 0.34409, + "grad_norm": 0.8045462094366068, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 34409 + }, + { + "epoch": 0.3441, + "grad_norm": 0.6889670566829941, + "learning_rate": 0.003, + "loss": 4.053, + "step": 34410 + }, + { + "epoch": 0.34411, + "grad_norm": 0.6353553276801132, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 34411 + }, + { + "epoch": 0.34412, + "grad_norm": 0.6480420536539415, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 34412 + }, + { + "epoch": 0.34413, + "grad_norm": 0.6445141408900813, + "learning_rate": 0.003, + "loss": 4.035, + "step": 34413 + }, + { + "epoch": 0.34414, + "grad_norm": 0.638126700908288, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 34414 + }, + { + "epoch": 0.34415, + "grad_norm": 0.7116979656648418, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 34415 + }, + { + "epoch": 0.34416, + "grad_norm": 0.8434477417143457, + "learning_rate": 0.003, + "loss": 4.032, + "step": 34416 + }, + { + "epoch": 0.34417, + "grad_norm": 0.8316456338766718, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 34417 + }, + { + "epoch": 0.34418, + "grad_norm": 0.7071585703980245, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 34418 + }, + { + "epoch": 0.34419, + "grad_norm": 0.6647223070510185, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 34419 + }, + { + "epoch": 0.3442, + "grad_norm": 0.7937624249375839, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 34420 + }, + { + "epoch": 0.34421, + "grad_norm": 0.8363459274095433, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 34421 + }, + { + "epoch": 0.34422, + "grad_norm": 0.8181373119953027, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 34422 + }, + { + "epoch": 0.34423, + "grad_norm": 0.8268951977055664, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 34423 + }, + { + "epoch": 0.34424, + "grad_norm": 0.8390735356167486, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 34424 + }, + { + "epoch": 0.34425, + "grad_norm": 0.8692473541297471, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 34425 + }, + { + "epoch": 0.34426, + "grad_norm": 1.0376042999418795, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 34426 + }, + { + "epoch": 0.34427, + "grad_norm": 1.2091898923807605, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 34427 + }, + { + "epoch": 0.34428, + "grad_norm": 0.7933274911408704, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 34428 + }, + { + "epoch": 0.34429, + "grad_norm": 0.7648897799768679, + "learning_rate": 0.003, + "loss": 4.054, + "step": 34429 + }, + { + "epoch": 0.3443, + "grad_norm": 0.8190916993448014, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 34430 + }, + { + "epoch": 0.34431, + "grad_norm": 0.8945365712701867, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 34431 + }, + { + "epoch": 0.34432, + "grad_norm": 0.9012615478087793, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 34432 + }, + { + "epoch": 0.34433, + "grad_norm": 0.9055400214296948, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 34433 + }, + { + "epoch": 0.34434, + "grad_norm": 0.8531961035640648, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 34434 + }, + { + "epoch": 0.34435, + "grad_norm": 0.8380405087179986, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 34435 + }, + { + "epoch": 0.34436, + "grad_norm": 0.9156856253885731, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 34436 + }, + { + "epoch": 0.34437, + "grad_norm": 0.9008425180649923, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 34437 + }, + { + "epoch": 0.34438, + "grad_norm": 0.9519058120000256, + "learning_rate": 0.003, + "loss": 4.022, + "step": 34438 + }, + { + "epoch": 0.34439, + "grad_norm": 1.0646092287705244, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 34439 + }, + { + "epoch": 0.3444, + "grad_norm": 0.8378762792749237, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 34440 + }, + { + "epoch": 0.34441, + "grad_norm": 0.6485099655237964, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 34441 + }, + { + "epoch": 0.34442, + "grad_norm": 0.6459689250686206, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 34442 + }, + { + "epoch": 0.34443, + "grad_norm": 0.6827979312615772, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 34443 + }, + { + "epoch": 0.34444, + "grad_norm": 0.6395002213577309, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 34444 + }, + { + "epoch": 0.34445, + "grad_norm": 0.6263494258134135, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 34445 + }, + { + "epoch": 0.34446, + "grad_norm": 0.7846406183637684, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 34446 + }, + { + "epoch": 0.34447, + "grad_norm": 0.9496997390306008, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 34447 + }, + { + "epoch": 0.34448, + "grad_norm": 1.066632209520968, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 34448 + }, + { + "epoch": 0.34449, + "grad_norm": 0.8334939772611274, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 34449 + }, + { + "epoch": 0.3445, + "grad_norm": 0.7309908659528728, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 34450 + }, + { + "epoch": 0.34451, + "grad_norm": 0.8099299652602019, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 34451 + }, + { + "epoch": 0.34452, + "grad_norm": 0.9285309097363291, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 34452 + }, + { + "epoch": 0.34453, + "grad_norm": 0.949055266865184, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 34453 + }, + { + "epoch": 0.34454, + "grad_norm": 0.8906996516390296, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 34454 + }, + { + "epoch": 0.34455, + "grad_norm": 0.7579628772437226, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 34455 + }, + { + "epoch": 0.34456, + "grad_norm": 0.7199277560393076, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 34456 + }, + { + "epoch": 0.34457, + "grad_norm": 0.7405526006916023, + "learning_rate": 0.003, + "loss": 4.052, + "step": 34457 + }, + { + "epoch": 0.34458, + "grad_norm": 0.847058536175887, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 34458 + }, + { + "epoch": 0.34459, + "grad_norm": 0.7891104802743765, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 34459 + }, + { + "epoch": 0.3446, + "grad_norm": 0.7403359572905774, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 34460 + }, + { + "epoch": 0.34461, + "grad_norm": 0.7777923144426475, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 34461 + }, + { + "epoch": 0.34462, + "grad_norm": 0.7270321301791535, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 34462 + }, + { + "epoch": 0.34463, + "grad_norm": 0.8351091971797917, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 34463 + }, + { + "epoch": 0.34464, + "grad_norm": 0.8329056104282853, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 34464 + }, + { + "epoch": 0.34465, + "grad_norm": 0.6945163884811114, + "learning_rate": 0.003, + "loss": 3.9968, + "step": 34465 + }, + { + "epoch": 0.34466, + "grad_norm": 0.7823688049564557, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 34466 + }, + { + "epoch": 0.34467, + "grad_norm": 0.8211512364531924, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 34467 + }, + { + "epoch": 0.34468, + "grad_norm": 0.9767848317581035, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 34468 + }, + { + "epoch": 0.34469, + "grad_norm": 1.113259243299537, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 34469 + }, + { + "epoch": 0.3447, + "grad_norm": 0.8945362407203205, + "learning_rate": 0.003, + "loss": 4.0134, + "step": 34470 + }, + { + "epoch": 0.34471, + "grad_norm": 1.021054546601807, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 34471 + }, + { + "epoch": 0.34472, + "grad_norm": 1.028389745168708, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 34472 + }, + { + "epoch": 0.34473, + "grad_norm": 1.0665819796141116, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 34473 + }, + { + "epoch": 0.34474, + "grad_norm": 0.9430507186661142, + "learning_rate": 0.003, + "loss": 4.058, + "step": 34474 + }, + { + "epoch": 0.34475, + "grad_norm": 0.9744671695296545, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 34475 + }, + { + "epoch": 0.34476, + "grad_norm": 0.9440649527200788, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 34476 + }, + { + "epoch": 0.34477, + "grad_norm": 0.9228166552876553, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 34477 + }, + { + "epoch": 0.34478, + "grad_norm": 0.8787752207843391, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 34478 + }, + { + "epoch": 0.34479, + "grad_norm": 0.9585833270810419, + "learning_rate": 0.003, + "loss": 4.0004, + "step": 34479 + }, + { + "epoch": 0.3448, + "grad_norm": 0.913230746739235, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 34480 + }, + { + "epoch": 0.34481, + "grad_norm": 0.8443848023179719, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 34481 + }, + { + "epoch": 0.34482, + "grad_norm": 0.9425020103806578, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 34482 + }, + { + "epoch": 0.34483, + "grad_norm": 1.1006441880868523, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 34483 + }, + { + "epoch": 0.34484, + "grad_norm": 0.823136366930898, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 34484 + }, + { + "epoch": 0.34485, + "grad_norm": 0.6960911088675873, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 34485 + }, + { + "epoch": 0.34486, + "grad_norm": 0.5919128082862295, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 34486 + }, + { + "epoch": 0.34487, + "grad_norm": 0.6504391539004777, + "learning_rate": 0.003, + "loss": 4.026, + "step": 34487 + }, + { + "epoch": 0.34488, + "grad_norm": 0.6791437979013697, + "learning_rate": 0.003, + "loss": 3.9875, + "step": 34488 + }, + { + "epoch": 0.34489, + "grad_norm": 0.75498878784684, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 34489 + }, + { + "epoch": 0.3449, + "grad_norm": 0.8743215113414976, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 34490 + }, + { + "epoch": 0.34491, + "grad_norm": 0.8667212292107455, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 34491 + }, + { + "epoch": 0.34492, + "grad_norm": 0.7704183451240254, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 34492 + }, + { + "epoch": 0.34493, + "grad_norm": 0.630302722729315, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 34493 + }, + { + "epoch": 0.34494, + "grad_norm": 0.5820349432020077, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 34494 + }, + { + "epoch": 0.34495, + "grad_norm": 0.5924345172755775, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 34495 + }, + { + "epoch": 0.34496, + "grad_norm": 0.6162099949796576, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 34496 + }, + { + "epoch": 0.34497, + "grad_norm": 0.7081988987761293, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 34497 + }, + { + "epoch": 0.34498, + "grad_norm": 0.7845977635084048, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 34498 + }, + { + "epoch": 0.34499, + "grad_norm": 0.8925926864490982, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 34499 + }, + { + "epoch": 0.345, + "grad_norm": 1.1076749896547693, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 34500 + }, + { + "epoch": 0.34501, + "grad_norm": 0.9281581851219178, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 34501 + }, + { + "epoch": 0.34502, + "grad_norm": 0.8642861625773332, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 34502 + }, + { + "epoch": 0.34503, + "grad_norm": 1.0046072231737118, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 34503 + }, + { + "epoch": 0.34504, + "grad_norm": 0.9838616773481328, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 34504 + }, + { + "epoch": 0.34505, + "grad_norm": 0.953057029619005, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 34505 + }, + { + "epoch": 0.34506, + "grad_norm": 0.9297301374460281, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 34506 + }, + { + "epoch": 0.34507, + "grad_norm": 0.7620046855883965, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 34507 + }, + { + "epoch": 0.34508, + "grad_norm": 0.6974924947418466, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 34508 + }, + { + "epoch": 0.34509, + "grad_norm": 0.646488825618821, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 34509 + }, + { + "epoch": 0.3451, + "grad_norm": 0.6750048949469442, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 34510 + }, + { + "epoch": 0.34511, + "grad_norm": 0.7280224074959399, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 34511 + }, + { + "epoch": 0.34512, + "grad_norm": 0.7914277567634888, + "learning_rate": 0.003, + "loss": 4.03, + "step": 34512 + }, + { + "epoch": 0.34513, + "grad_norm": 1.0964289554153872, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 34513 + }, + { + "epoch": 0.34514, + "grad_norm": 1.2791377206822097, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 34514 + }, + { + "epoch": 0.34515, + "grad_norm": 0.7700551973076047, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 34515 + }, + { + "epoch": 0.34516, + "grad_norm": 0.8039798203904456, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 34516 + }, + { + "epoch": 0.34517, + "grad_norm": 0.8049864972043989, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 34517 + }, + { + "epoch": 0.34518, + "grad_norm": 0.8346477015817062, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 34518 + }, + { + "epoch": 0.34519, + "grad_norm": 0.8084706744463853, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 34519 + }, + { + "epoch": 0.3452, + "grad_norm": 0.7862310731894386, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 34520 + }, + { + "epoch": 0.34521, + "grad_norm": 0.7845774376710308, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 34521 + }, + { + "epoch": 0.34522, + "grad_norm": 0.8661238083116782, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 34522 + }, + { + "epoch": 0.34523, + "grad_norm": 0.8291039374317084, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 34523 + }, + { + "epoch": 0.34524, + "grad_norm": 0.8582608514630463, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 34524 + }, + { + "epoch": 0.34525, + "grad_norm": 0.9565947836252808, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 34525 + }, + { + "epoch": 0.34526, + "grad_norm": 1.011379152526198, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 34526 + }, + { + "epoch": 0.34527, + "grad_norm": 1.0536115484476516, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 34527 + }, + { + "epoch": 0.34528, + "grad_norm": 0.9223553392970494, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 34528 + }, + { + "epoch": 0.34529, + "grad_norm": 0.8834702568551616, + "learning_rate": 0.003, + "loss": 4.029, + "step": 34529 + }, + { + "epoch": 0.3453, + "grad_norm": 0.9643716282096577, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 34530 + }, + { + "epoch": 0.34531, + "grad_norm": 1.0549261361948874, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 34531 + }, + { + "epoch": 0.34532, + "grad_norm": 0.8681925652959624, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 34532 + }, + { + "epoch": 0.34533, + "grad_norm": 0.8081112366219122, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 34533 + }, + { + "epoch": 0.34534, + "grad_norm": 0.8016640399609277, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 34534 + }, + { + "epoch": 0.34535, + "grad_norm": 0.7256033309801178, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 34535 + }, + { + "epoch": 0.34536, + "grad_norm": 0.7213868952615656, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 34536 + }, + { + "epoch": 0.34537, + "grad_norm": 0.6930666920656228, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 34537 + }, + { + "epoch": 0.34538, + "grad_norm": 0.7418710892844897, + "learning_rate": 0.003, + "loss": 4.033, + "step": 34538 + }, + { + "epoch": 0.34539, + "grad_norm": 0.906730519742983, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 34539 + }, + { + "epoch": 0.3454, + "grad_norm": 1.1408856132068281, + "learning_rate": 0.003, + "loss": 4.048, + "step": 34540 + }, + { + "epoch": 0.34541, + "grad_norm": 0.822724275633429, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 34541 + }, + { + "epoch": 0.34542, + "grad_norm": 0.7215945830847605, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 34542 + }, + { + "epoch": 0.34543, + "grad_norm": 0.7712502921885293, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 34543 + }, + { + "epoch": 0.34544, + "grad_norm": 0.8554056663840541, + "learning_rate": 0.003, + "loss": 4.027, + "step": 34544 + }, + { + "epoch": 0.34545, + "grad_norm": 0.822493298851573, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 34545 + }, + { + "epoch": 0.34546, + "grad_norm": 0.6997806874733459, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 34546 + }, + { + "epoch": 0.34547, + "grad_norm": 0.6465514566310105, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 34547 + }, + { + "epoch": 0.34548, + "grad_norm": 0.7415188196452973, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 34548 + }, + { + "epoch": 0.34549, + "grad_norm": 0.8149508153396584, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 34549 + }, + { + "epoch": 0.3455, + "grad_norm": 0.9039037170837417, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 34550 + }, + { + "epoch": 0.34551, + "grad_norm": 1.09703868368854, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 34551 + }, + { + "epoch": 0.34552, + "grad_norm": 1.112548429801207, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 34552 + }, + { + "epoch": 0.34553, + "grad_norm": 0.8596606829898167, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 34553 + }, + { + "epoch": 0.34554, + "grad_norm": 0.703125732760317, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 34554 + }, + { + "epoch": 0.34555, + "grad_norm": 0.665232441408588, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 34555 + }, + { + "epoch": 0.34556, + "grad_norm": 0.6759826399629765, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 34556 + }, + { + "epoch": 0.34557, + "grad_norm": 0.7933773221043016, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 34557 + }, + { + "epoch": 0.34558, + "grad_norm": 0.8609410667085539, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 34558 + }, + { + "epoch": 0.34559, + "grad_norm": 0.8195135548338098, + "learning_rate": 0.003, + "loss": 4.047, + "step": 34559 + }, + { + "epoch": 0.3456, + "grad_norm": 0.8250583228442626, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 34560 + }, + { + "epoch": 0.34561, + "grad_norm": 0.9270330730108449, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 34561 + }, + { + "epoch": 0.34562, + "grad_norm": 0.9079094707007858, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 34562 + }, + { + "epoch": 0.34563, + "grad_norm": 0.892176790650534, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 34563 + }, + { + "epoch": 0.34564, + "grad_norm": 0.8183163745853229, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 34564 + }, + { + "epoch": 0.34565, + "grad_norm": 0.9102899867379226, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 34565 + }, + { + "epoch": 0.34566, + "grad_norm": 0.9670867455092108, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 34566 + }, + { + "epoch": 0.34567, + "grad_norm": 1.1761193338658684, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 34567 + }, + { + "epoch": 0.34568, + "grad_norm": 0.8410875220313775, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 34568 + }, + { + "epoch": 0.34569, + "grad_norm": 0.6812215398123858, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 34569 + }, + { + "epoch": 0.3457, + "grad_norm": 0.7412583822425433, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 34570 + }, + { + "epoch": 0.34571, + "grad_norm": 0.7463570027262695, + "learning_rate": 0.003, + "loss": 4.041, + "step": 34571 + }, + { + "epoch": 0.34572, + "grad_norm": 0.7732749595965485, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 34572 + }, + { + "epoch": 0.34573, + "grad_norm": 0.733446532799318, + "learning_rate": 0.003, + "loss": 4.0022, + "step": 34573 + }, + { + "epoch": 0.34574, + "grad_norm": 0.5783139804370725, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 34574 + }, + { + "epoch": 0.34575, + "grad_norm": 0.5553829762633938, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 34575 + }, + { + "epoch": 0.34576, + "grad_norm": 0.519178272044254, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 34576 + }, + { + "epoch": 0.34577, + "grad_norm": 0.5400796769481038, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 34577 + }, + { + "epoch": 0.34578, + "grad_norm": 0.6244169251633643, + "learning_rate": 0.003, + "loss": 4.0019, + "step": 34578 + }, + { + "epoch": 0.34579, + "grad_norm": 0.7710071632116319, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 34579 + }, + { + "epoch": 0.3458, + "grad_norm": 0.9574734369924512, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 34580 + }, + { + "epoch": 0.34581, + "grad_norm": 1.0608778730643844, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 34581 + }, + { + "epoch": 0.34582, + "grad_norm": 0.9327007147221428, + "learning_rate": 0.003, + "loss": 4.057, + "step": 34582 + }, + { + "epoch": 0.34583, + "grad_norm": 0.8412969469583328, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 34583 + }, + { + "epoch": 0.34584, + "grad_norm": 0.7916873323799967, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 34584 + }, + { + "epoch": 0.34585, + "grad_norm": 0.9286630699257109, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 34585 + }, + { + "epoch": 0.34586, + "grad_norm": 0.821698764205388, + "learning_rate": 0.003, + "loss": 4.0056, + "step": 34586 + }, + { + "epoch": 0.34587, + "grad_norm": 0.752623117435259, + "learning_rate": 0.003, + "loss": 4.03, + "step": 34587 + }, + { + "epoch": 0.34588, + "grad_norm": 0.7307158467940512, + "learning_rate": 0.003, + "loss": 4.031, + "step": 34588 + }, + { + "epoch": 0.34589, + "grad_norm": 0.8431457537234694, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 34589 + }, + { + "epoch": 0.3459, + "grad_norm": 0.7161184904473243, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 34590 + }, + { + "epoch": 0.34591, + "grad_norm": 0.7055874432909173, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 34591 + }, + { + "epoch": 0.34592, + "grad_norm": 0.6863780238149824, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 34592 + }, + { + "epoch": 0.34593, + "grad_norm": 0.7950832105897014, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 34593 + }, + { + "epoch": 0.34594, + "grad_norm": 1.0942104249739206, + "learning_rate": 0.003, + "loss": 4.056, + "step": 34594 + }, + { + "epoch": 0.34595, + "grad_norm": 1.333949293942147, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 34595 + }, + { + "epoch": 0.34596, + "grad_norm": 0.6479003370107027, + "learning_rate": 0.003, + "loss": 4.067, + "step": 34596 + }, + { + "epoch": 0.34597, + "grad_norm": 0.6703786679202025, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 34597 + }, + { + "epoch": 0.34598, + "grad_norm": 0.6626222824528238, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 34598 + }, + { + "epoch": 0.34599, + "grad_norm": 0.6181534129231927, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 34599 + }, + { + "epoch": 0.346, + "grad_norm": 0.6149757757079107, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 34600 + }, + { + "epoch": 0.34601, + "grad_norm": 0.6210751975947761, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 34601 + }, + { + "epoch": 0.34602, + "grad_norm": 0.6490147228503137, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 34602 + }, + { + "epoch": 0.34603, + "grad_norm": 0.7014481306282548, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 34603 + }, + { + "epoch": 0.34604, + "grad_norm": 0.8595456690331306, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 34604 + }, + { + "epoch": 0.34605, + "grad_norm": 1.0158973415551695, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 34605 + }, + { + "epoch": 0.34606, + "grad_norm": 1.1238570530522944, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 34606 + }, + { + "epoch": 0.34607, + "grad_norm": 0.9983904378473883, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 34607 + }, + { + "epoch": 0.34608, + "grad_norm": 0.9135744003058295, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 34608 + }, + { + "epoch": 0.34609, + "grad_norm": 0.8453986939453443, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 34609 + }, + { + "epoch": 0.3461, + "grad_norm": 0.8334210974995194, + "learning_rate": 0.003, + "loss": 4.031, + "step": 34610 + }, + { + "epoch": 0.34611, + "grad_norm": 0.9457224619920491, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 34611 + }, + { + "epoch": 0.34612, + "grad_norm": 1.0695793809515244, + "learning_rate": 0.003, + "loss": 4.023, + "step": 34612 + }, + { + "epoch": 0.34613, + "grad_norm": 0.962696952853266, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 34613 + }, + { + "epoch": 0.34614, + "grad_norm": 0.9366829546836091, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 34614 + }, + { + "epoch": 0.34615, + "grad_norm": 0.9225906274237546, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 34615 + }, + { + "epoch": 0.34616, + "grad_norm": 0.9356656459165791, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 34616 + }, + { + "epoch": 0.34617, + "grad_norm": 1.0443650388386483, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 34617 + }, + { + "epoch": 0.34618, + "grad_norm": 0.9089256689905104, + "learning_rate": 0.003, + "loss": 4.073, + "step": 34618 + }, + { + "epoch": 0.34619, + "grad_norm": 0.9245679770470324, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 34619 + }, + { + "epoch": 0.3462, + "grad_norm": 0.906816281061801, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 34620 + }, + { + "epoch": 0.34621, + "grad_norm": 0.8900554779205011, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 34621 + }, + { + "epoch": 0.34622, + "grad_norm": 0.9167210755072268, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 34622 + }, + { + "epoch": 0.34623, + "grad_norm": 0.9271246485469452, + "learning_rate": 0.003, + "loss": 4.056, + "step": 34623 + }, + { + "epoch": 0.34624, + "grad_norm": 0.9671650664491138, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 34624 + }, + { + "epoch": 0.34625, + "grad_norm": 1.0878599429961624, + "learning_rate": 0.003, + "loss": 4.055, + "step": 34625 + }, + { + "epoch": 0.34626, + "grad_norm": 0.9730720371756786, + "learning_rate": 0.003, + "loss": 4.077, + "step": 34626 + }, + { + "epoch": 0.34627, + "grad_norm": 0.9880396702064486, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 34627 + }, + { + "epoch": 0.34628, + "grad_norm": 0.8894830436738097, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 34628 + }, + { + "epoch": 0.34629, + "grad_norm": 0.8647243078311875, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 34629 + }, + { + "epoch": 0.3463, + "grad_norm": 0.940285445468824, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 34630 + }, + { + "epoch": 0.34631, + "grad_norm": 1.054761862061168, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 34631 + }, + { + "epoch": 0.34632, + "grad_norm": 0.9956283918936183, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 34632 + }, + { + "epoch": 0.34633, + "grad_norm": 0.9920727029686527, + "learning_rate": 0.003, + "loss": 4.068, + "step": 34633 + }, + { + "epoch": 0.34634, + "grad_norm": 0.9818429444654575, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 34634 + }, + { + "epoch": 0.34635, + "grad_norm": 0.8343387384144243, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 34635 + }, + { + "epoch": 0.34636, + "grad_norm": 0.7037457315731984, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 34636 + }, + { + "epoch": 0.34637, + "grad_norm": 0.6375211942443303, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 34637 + }, + { + "epoch": 0.34638, + "grad_norm": 0.6151064195032665, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 34638 + }, + { + "epoch": 0.34639, + "grad_norm": 0.6225598502319716, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 34639 + }, + { + "epoch": 0.3464, + "grad_norm": 0.5565320308539808, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 34640 + }, + { + "epoch": 0.34641, + "grad_norm": 0.5391187835487762, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 34641 + }, + { + "epoch": 0.34642, + "grad_norm": 0.5304891140402174, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 34642 + }, + { + "epoch": 0.34643, + "grad_norm": 0.53725390236833, + "learning_rate": 0.003, + "loss": 4.039, + "step": 34643 + }, + { + "epoch": 0.34644, + "grad_norm": 0.5373666527301979, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 34644 + }, + { + "epoch": 0.34645, + "grad_norm": 0.5363503966411274, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 34645 + }, + { + "epoch": 0.34646, + "grad_norm": 0.6447024696473392, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 34646 + }, + { + "epoch": 0.34647, + "grad_norm": 0.7033485475880766, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 34647 + }, + { + "epoch": 0.34648, + "grad_norm": 0.7326603503677568, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 34648 + }, + { + "epoch": 0.34649, + "grad_norm": 0.9234486521403993, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 34649 + }, + { + "epoch": 0.3465, + "grad_norm": 1.1905030246273414, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 34650 + }, + { + "epoch": 0.34651, + "grad_norm": 0.8167809875810752, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 34651 + }, + { + "epoch": 0.34652, + "grad_norm": 0.6583152294429505, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 34652 + }, + { + "epoch": 0.34653, + "grad_norm": 0.7267714883663945, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 34653 + }, + { + "epoch": 0.34654, + "grad_norm": 0.8434706300935787, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 34654 + }, + { + "epoch": 0.34655, + "grad_norm": 0.9338549873440369, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 34655 + }, + { + "epoch": 0.34656, + "grad_norm": 1.0213280635295177, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 34656 + }, + { + "epoch": 0.34657, + "grad_norm": 1.1361374853118171, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 34657 + }, + { + "epoch": 0.34658, + "grad_norm": 0.8473788764978512, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 34658 + }, + { + "epoch": 0.34659, + "grad_norm": 0.7906467663928731, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 34659 + }, + { + "epoch": 0.3466, + "grad_norm": 0.6680314017972818, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 34660 + }, + { + "epoch": 0.34661, + "grad_norm": 0.6267316251819774, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 34661 + }, + { + "epoch": 0.34662, + "grad_norm": 0.6906038335237771, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 34662 + }, + { + "epoch": 0.34663, + "grad_norm": 0.7703863127917444, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 34663 + }, + { + "epoch": 0.34664, + "grad_norm": 0.8309692348678631, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 34664 + }, + { + "epoch": 0.34665, + "grad_norm": 1.0428412189437197, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 34665 + }, + { + "epoch": 0.34666, + "grad_norm": 1.012451033553391, + "learning_rate": 0.003, + "loss": 4.076, + "step": 34666 + }, + { + "epoch": 0.34667, + "grad_norm": 0.7973078728052969, + "learning_rate": 0.003, + "loss": 4.073, + "step": 34667 + }, + { + "epoch": 0.34668, + "grad_norm": 0.7654093633267353, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 34668 + }, + { + "epoch": 0.34669, + "grad_norm": 0.7487491333825574, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 34669 + }, + { + "epoch": 0.3467, + "grad_norm": 0.6943802948742906, + "learning_rate": 0.003, + "loss": 4.0058, + "step": 34670 + }, + { + "epoch": 0.34671, + "grad_norm": 0.6145934426615317, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 34671 + }, + { + "epoch": 0.34672, + "grad_norm": 0.6353469729646086, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 34672 + }, + { + "epoch": 0.34673, + "grad_norm": 0.9130658519726988, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 34673 + }, + { + "epoch": 0.34674, + "grad_norm": 1.210078100423299, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 34674 + }, + { + "epoch": 0.34675, + "grad_norm": 0.7404195414523346, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 34675 + }, + { + "epoch": 0.34676, + "grad_norm": 0.6580430913109181, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 34676 + }, + { + "epoch": 0.34677, + "grad_norm": 0.7268081665402026, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 34677 + }, + { + "epoch": 0.34678, + "grad_norm": 0.8115557553891181, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 34678 + }, + { + "epoch": 0.34679, + "grad_norm": 0.885773058727303, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 34679 + }, + { + "epoch": 0.3468, + "grad_norm": 0.8319471666526586, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 34680 + }, + { + "epoch": 0.34681, + "grad_norm": 0.8267729688490805, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 34681 + }, + { + "epoch": 0.34682, + "grad_norm": 0.8343788479570813, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 34682 + }, + { + "epoch": 0.34683, + "grad_norm": 0.8142642524117197, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 34683 + }, + { + "epoch": 0.34684, + "grad_norm": 0.7368165380678396, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 34684 + }, + { + "epoch": 0.34685, + "grad_norm": 0.6959416429170956, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 34685 + }, + { + "epoch": 0.34686, + "grad_norm": 0.6897600147936405, + "learning_rate": 0.003, + "loss": 3.9975, + "step": 34686 + }, + { + "epoch": 0.34687, + "grad_norm": 0.7394440975365838, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 34687 + }, + { + "epoch": 0.34688, + "grad_norm": 0.82523715530707, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 34688 + }, + { + "epoch": 0.34689, + "grad_norm": 0.8574437515414237, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 34689 + }, + { + "epoch": 0.3469, + "grad_norm": 0.943895653666287, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 34690 + }, + { + "epoch": 0.34691, + "grad_norm": 1.0091286049220174, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 34691 + }, + { + "epoch": 0.34692, + "grad_norm": 1.1548239807244807, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 34692 + }, + { + "epoch": 0.34693, + "grad_norm": 0.8357080412700646, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 34693 + }, + { + "epoch": 0.34694, + "grad_norm": 0.7171877625081226, + "learning_rate": 0.003, + "loss": 4.008, + "step": 34694 + }, + { + "epoch": 0.34695, + "grad_norm": 0.7422227148232964, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 34695 + }, + { + "epoch": 0.34696, + "grad_norm": 0.7896766684999509, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 34696 + }, + { + "epoch": 0.34697, + "grad_norm": 0.7988083693700572, + "learning_rate": 0.003, + "loss": 3.9833, + "step": 34697 + }, + { + "epoch": 0.34698, + "grad_norm": 0.8094935048323625, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 34698 + }, + { + "epoch": 0.34699, + "grad_norm": 0.9552294378053499, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 34699 + }, + { + "epoch": 0.347, + "grad_norm": 1.045563797404123, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 34700 + }, + { + "epoch": 0.34701, + "grad_norm": 1.0876783982786937, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 34701 + }, + { + "epoch": 0.34702, + "grad_norm": 0.9982285691774913, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 34702 + }, + { + "epoch": 0.34703, + "grad_norm": 1.0507514051842994, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 34703 + }, + { + "epoch": 0.34704, + "grad_norm": 0.925438513338819, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 34704 + }, + { + "epoch": 0.34705, + "grad_norm": 0.9406906776710916, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 34705 + }, + { + "epoch": 0.34706, + "grad_norm": 0.7965824073342543, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 34706 + }, + { + "epoch": 0.34707, + "grad_norm": 0.7313383107360606, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 34707 + }, + { + "epoch": 0.34708, + "grad_norm": 0.5729166377715145, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 34708 + }, + { + "epoch": 0.34709, + "grad_norm": 0.6134522572149388, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 34709 + }, + { + "epoch": 0.3471, + "grad_norm": 0.6931489569884725, + "learning_rate": 0.003, + "loss": 4.033, + "step": 34710 + }, + { + "epoch": 0.34711, + "grad_norm": 0.9295916027675596, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 34711 + }, + { + "epoch": 0.34712, + "grad_norm": 1.2382729696312988, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 34712 + }, + { + "epoch": 0.34713, + "grad_norm": 0.8174289835703898, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 34713 + }, + { + "epoch": 0.34714, + "grad_norm": 0.7743258962350672, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 34714 + }, + { + "epoch": 0.34715, + "grad_norm": 0.7144251891931662, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 34715 + }, + { + "epoch": 0.34716, + "grad_norm": 0.7155749841536181, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 34716 + }, + { + "epoch": 0.34717, + "grad_norm": 0.561932277594511, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 34717 + }, + { + "epoch": 0.34718, + "grad_norm": 0.5502834180443676, + "learning_rate": 0.003, + "loss": 4.0031, + "step": 34718 + }, + { + "epoch": 0.34719, + "grad_norm": 0.5480694725499755, + "learning_rate": 0.003, + "loss": 4.007, + "step": 34719 + }, + { + "epoch": 0.3472, + "grad_norm": 0.6273215776253827, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 34720 + }, + { + "epoch": 0.34721, + "grad_norm": 0.7584603593059065, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 34721 + }, + { + "epoch": 0.34722, + "grad_norm": 0.81603078100306, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 34722 + }, + { + "epoch": 0.34723, + "grad_norm": 0.8594573437945927, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 34723 + }, + { + "epoch": 0.34724, + "grad_norm": 0.9951991951311653, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 34724 + }, + { + "epoch": 0.34725, + "grad_norm": 1.041162204345738, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 34725 + }, + { + "epoch": 0.34726, + "grad_norm": 0.9239428607476896, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 34726 + }, + { + "epoch": 0.34727, + "grad_norm": 0.9465568236722067, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 34727 + }, + { + "epoch": 0.34728, + "grad_norm": 0.9433310374917266, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 34728 + }, + { + "epoch": 0.34729, + "grad_norm": 1.0748439474844254, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 34729 + }, + { + "epoch": 0.3473, + "grad_norm": 0.9226510694415608, + "learning_rate": 0.003, + "loss": 4.008, + "step": 34730 + }, + { + "epoch": 0.34731, + "grad_norm": 0.7656419682194239, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 34731 + }, + { + "epoch": 0.34732, + "grad_norm": 0.836469575787767, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 34732 + }, + { + "epoch": 0.34733, + "grad_norm": 0.8174535565188165, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 34733 + }, + { + "epoch": 0.34734, + "grad_norm": 0.7917935316776014, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 34734 + }, + { + "epoch": 0.34735, + "grad_norm": 0.9103871409165347, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 34735 + }, + { + "epoch": 0.34736, + "grad_norm": 1.0253272727526552, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 34736 + }, + { + "epoch": 0.34737, + "grad_norm": 1.0005938816867341, + "learning_rate": 0.003, + "loss": 4.077, + "step": 34737 + }, + { + "epoch": 0.34738, + "grad_norm": 0.9042397886112195, + "learning_rate": 0.003, + "loss": 4.077, + "step": 34738 + }, + { + "epoch": 0.34739, + "grad_norm": 0.7530757325860072, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 34739 + }, + { + "epoch": 0.3474, + "grad_norm": 0.840596787096, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 34740 + }, + { + "epoch": 0.34741, + "grad_norm": 0.9256269651664081, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 34741 + }, + { + "epoch": 0.34742, + "grad_norm": 0.9318867064370578, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 34742 + }, + { + "epoch": 0.34743, + "grad_norm": 0.7676936460923819, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 34743 + }, + { + "epoch": 0.34744, + "grad_norm": 0.7296999772557023, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 34744 + }, + { + "epoch": 0.34745, + "grad_norm": 0.712787120356627, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 34745 + }, + { + "epoch": 0.34746, + "grad_norm": 0.835509356460299, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 34746 + }, + { + "epoch": 0.34747, + "grad_norm": 0.9458465800789766, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 34747 + }, + { + "epoch": 0.34748, + "grad_norm": 1.2109992687389635, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 34748 + }, + { + "epoch": 0.34749, + "grad_norm": 0.8745993912754669, + "learning_rate": 0.003, + "loss": 4.035, + "step": 34749 + }, + { + "epoch": 0.3475, + "grad_norm": 0.8908407250534748, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 34750 + }, + { + "epoch": 0.34751, + "grad_norm": 0.8823839356389209, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 34751 + }, + { + "epoch": 0.34752, + "grad_norm": 0.8316918283183974, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 34752 + }, + { + "epoch": 0.34753, + "grad_norm": 0.6936438907768921, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 34753 + }, + { + "epoch": 0.34754, + "grad_norm": 0.5529946924955295, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 34754 + }, + { + "epoch": 0.34755, + "grad_norm": 0.6047875636151336, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 34755 + }, + { + "epoch": 0.34756, + "grad_norm": 0.6940884590116089, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 34756 + }, + { + "epoch": 0.34757, + "grad_norm": 0.7180263609359335, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 34757 + }, + { + "epoch": 0.34758, + "grad_norm": 0.7433459948604162, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 34758 + }, + { + "epoch": 0.34759, + "grad_norm": 0.7606738060953074, + "learning_rate": 0.003, + "loss": 4.044, + "step": 34759 + }, + { + "epoch": 0.3476, + "grad_norm": 0.8412939640293552, + "learning_rate": 0.003, + "loss": 4.035, + "step": 34760 + }, + { + "epoch": 0.34761, + "grad_norm": 0.8368877813487658, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 34761 + }, + { + "epoch": 0.34762, + "grad_norm": 0.9483894419874874, + "learning_rate": 0.003, + "loss": 4.077, + "step": 34762 + }, + { + "epoch": 0.34763, + "grad_norm": 1.1263368545780004, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 34763 + }, + { + "epoch": 0.34764, + "grad_norm": 1.0033864793508824, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 34764 + }, + { + "epoch": 0.34765, + "grad_norm": 0.8768536146889513, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 34765 + }, + { + "epoch": 0.34766, + "grad_norm": 0.8227487190867583, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 34766 + }, + { + "epoch": 0.34767, + "grad_norm": 0.8243665457106375, + "learning_rate": 0.003, + "loss": 4.01, + "step": 34767 + }, + { + "epoch": 0.34768, + "grad_norm": 0.7966291930657166, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 34768 + }, + { + "epoch": 0.34769, + "grad_norm": 0.8748551111439837, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 34769 + }, + { + "epoch": 0.3477, + "grad_norm": 0.9892927715322323, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 34770 + }, + { + "epoch": 0.34771, + "grad_norm": 0.950218747700461, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 34771 + }, + { + "epoch": 0.34772, + "grad_norm": 0.8584039565498719, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 34772 + }, + { + "epoch": 0.34773, + "grad_norm": 0.8669418712782091, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 34773 + }, + { + "epoch": 0.34774, + "grad_norm": 0.7933989289868888, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 34774 + }, + { + "epoch": 0.34775, + "grad_norm": 0.7796433417534827, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 34775 + }, + { + "epoch": 0.34776, + "grad_norm": 0.9833326748327749, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 34776 + }, + { + "epoch": 0.34777, + "grad_norm": 1.2772744338775783, + "learning_rate": 0.003, + "loss": 4.028, + "step": 34777 + }, + { + "epoch": 0.34778, + "grad_norm": 0.7544013510428906, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 34778 + }, + { + "epoch": 0.34779, + "grad_norm": 0.6275361939374325, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 34779 + }, + { + "epoch": 0.3478, + "grad_norm": 0.6064907532218664, + "learning_rate": 0.003, + "loss": 4.075, + "step": 34780 + }, + { + "epoch": 0.34781, + "grad_norm": 0.5767263730471315, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 34781 + }, + { + "epoch": 0.34782, + "grad_norm": 0.5723237641714202, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 34782 + }, + { + "epoch": 0.34783, + "grad_norm": 0.5662318260534049, + "learning_rate": 0.003, + "loss": 4.034, + "step": 34783 + }, + { + "epoch": 0.34784, + "grad_norm": 0.6685311465520342, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 34784 + }, + { + "epoch": 0.34785, + "grad_norm": 0.7023981363431896, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 34785 + }, + { + "epoch": 0.34786, + "grad_norm": 0.6467955636982758, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 34786 + }, + { + "epoch": 0.34787, + "grad_norm": 0.6838634906520717, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 34787 + }, + { + "epoch": 0.34788, + "grad_norm": 0.7455780827863456, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 34788 + }, + { + "epoch": 0.34789, + "grad_norm": 0.8718816447851796, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 34789 + }, + { + "epoch": 0.3479, + "grad_norm": 1.1541168474226622, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 34790 + }, + { + "epoch": 0.34791, + "grad_norm": 1.0879160390794589, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 34791 + }, + { + "epoch": 0.34792, + "grad_norm": 0.8138527950484187, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 34792 + }, + { + "epoch": 0.34793, + "grad_norm": 0.6525770067634966, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 34793 + }, + { + "epoch": 0.34794, + "grad_norm": 0.6547578962947094, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 34794 + }, + { + "epoch": 0.34795, + "grad_norm": 0.7514528002685812, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 34795 + }, + { + "epoch": 0.34796, + "grad_norm": 0.9044656162626779, + "learning_rate": 0.003, + "loss": 4.0071, + "step": 34796 + }, + { + "epoch": 0.34797, + "grad_norm": 1.042847608493205, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 34797 + }, + { + "epoch": 0.34798, + "grad_norm": 0.9049053123567403, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 34798 + }, + { + "epoch": 0.34799, + "grad_norm": 0.8683226658531115, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 34799 + }, + { + "epoch": 0.348, + "grad_norm": 0.7914889806116041, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 34800 + }, + { + "epoch": 0.34801, + "grad_norm": 0.8924653497452334, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 34801 + }, + { + "epoch": 0.34802, + "grad_norm": 1.0624968105762187, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 34802 + }, + { + "epoch": 0.34803, + "grad_norm": 0.8943419126907487, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 34803 + }, + { + "epoch": 0.34804, + "grad_norm": 0.9276623079610434, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 34804 + }, + { + "epoch": 0.34805, + "grad_norm": 1.0710552048905924, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 34805 + }, + { + "epoch": 0.34806, + "grad_norm": 0.916326203789788, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 34806 + }, + { + "epoch": 0.34807, + "grad_norm": 0.8451856335538793, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 34807 + }, + { + "epoch": 0.34808, + "grad_norm": 0.8845098805651169, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 34808 + }, + { + "epoch": 0.34809, + "grad_norm": 0.8298754709042115, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 34809 + }, + { + "epoch": 0.3481, + "grad_norm": 0.7979616576512674, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 34810 + }, + { + "epoch": 0.34811, + "grad_norm": 0.8174193558346711, + "learning_rate": 0.003, + "loss": 4.046, + "step": 34811 + }, + { + "epoch": 0.34812, + "grad_norm": 0.761507673595115, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 34812 + }, + { + "epoch": 0.34813, + "grad_norm": 0.7483840031618263, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 34813 + }, + { + "epoch": 0.34814, + "grad_norm": 0.7162146534626899, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 34814 + }, + { + "epoch": 0.34815, + "grad_norm": 0.7308426874742306, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 34815 + }, + { + "epoch": 0.34816, + "grad_norm": 0.7157343914499259, + "learning_rate": 0.003, + "loss": 4.032, + "step": 34816 + }, + { + "epoch": 0.34817, + "grad_norm": 0.7195677003381832, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 34817 + }, + { + "epoch": 0.34818, + "grad_norm": 0.7783442335972965, + "learning_rate": 0.003, + "loss": 4.047, + "step": 34818 + }, + { + "epoch": 0.34819, + "grad_norm": 0.9598094563927628, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 34819 + }, + { + "epoch": 0.3482, + "grad_norm": 1.145875171675095, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 34820 + }, + { + "epoch": 0.34821, + "grad_norm": 0.9451536437119268, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 34821 + }, + { + "epoch": 0.34822, + "grad_norm": 0.942854331384803, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 34822 + }, + { + "epoch": 0.34823, + "grad_norm": 0.9281956947374154, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 34823 + }, + { + "epoch": 0.34824, + "grad_norm": 0.9662005242479036, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 34824 + }, + { + "epoch": 0.34825, + "grad_norm": 0.9761321112506395, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 34825 + }, + { + "epoch": 0.34826, + "grad_norm": 0.9154788958678094, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 34826 + }, + { + "epoch": 0.34827, + "grad_norm": 0.8913679145479757, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 34827 + }, + { + "epoch": 0.34828, + "grad_norm": 1.046474835439531, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 34828 + }, + { + "epoch": 0.34829, + "grad_norm": 1.0788628762490025, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 34829 + }, + { + "epoch": 0.3483, + "grad_norm": 0.8076254421546771, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 34830 + }, + { + "epoch": 0.34831, + "grad_norm": 0.8076094009374168, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 34831 + }, + { + "epoch": 0.34832, + "grad_norm": 0.7520285133875139, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 34832 + }, + { + "epoch": 0.34833, + "grad_norm": 0.6716349828816769, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 34833 + }, + { + "epoch": 0.34834, + "grad_norm": 0.6245404326115679, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 34834 + }, + { + "epoch": 0.34835, + "grad_norm": 0.5777800436049227, + "learning_rate": 0.003, + "loss": 4.0059, + "step": 34835 + }, + { + "epoch": 0.34836, + "grad_norm": 0.6483968595268288, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 34836 + }, + { + "epoch": 0.34837, + "grad_norm": 0.7814632467617743, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 34837 + }, + { + "epoch": 0.34838, + "grad_norm": 1.0141548948813364, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 34838 + }, + { + "epoch": 0.34839, + "grad_norm": 1.119245299189435, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 34839 + }, + { + "epoch": 0.3484, + "grad_norm": 0.8715540293485823, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 34840 + }, + { + "epoch": 0.34841, + "grad_norm": 0.9020741018243255, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 34841 + }, + { + "epoch": 0.34842, + "grad_norm": 0.9053648074141472, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 34842 + }, + { + "epoch": 0.34843, + "grad_norm": 0.8644757439425724, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 34843 + }, + { + "epoch": 0.34844, + "grad_norm": 0.7756459808539292, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 34844 + }, + { + "epoch": 0.34845, + "grad_norm": 0.70387563374554, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 34845 + }, + { + "epoch": 0.34846, + "grad_norm": 0.6723979992201538, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 34846 + }, + { + "epoch": 0.34847, + "grad_norm": 0.6980881245470795, + "learning_rate": 0.003, + "loss": 4.02, + "step": 34847 + }, + { + "epoch": 0.34848, + "grad_norm": 0.7065682781466058, + "learning_rate": 0.003, + "loss": 4.036, + "step": 34848 + }, + { + "epoch": 0.34849, + "grad_norm": 0.6722477228598817, + "learning_rate": 0.003, + "loss": 4.0006, + "step": 34849 + }, + { + "epoch": 0.3485, + "grad_norm": 0.8120389736326744, + "learning_rate": 0.003, + "loss": 4.022, + "step": 34850 + }, + { + "epoch": 0.34851, + "grad_norm": 0.9922655798716832, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 34851 + }, + { + "epoch": 0.34852, + "grad_norm": 1.0960085176025909, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 34852 + }, + { + "epoch": 0.34853, + "grad_norm": 0.7180626813455017, + "learning_rate": 0.003, + "loss": 4.026, + "step": 34853 + }, + { + "epoch": 0.34854, + "grad_norm": 0.7275656432158131, + "learning_rate": 0.003, + "loss": 4.0004, + "step": 34854 + }, + { + "epoch": 0.34855, + "grad_norm": 0.8587000035685581, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 34855 + }, + { + "epoch": 0.34856, + "grad_norm": 0.7190721139882555, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 34856 + }, + { + "epoch": 0.34857, + "grad_norm": 0.6380061215932387, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 34857 + }, + { + "epoch": 0.34858, + "grad_norm": 0.6221569752572831, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 34858 + }, + { + "epoch": 0.34859, + "grad_norm": 0.5866312562054267, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 34859 + }, + { + "epoch": 0.3486, + "grad_norm": 0.6087183847920858, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 34860 + }, + { + "epoch": 0.34861, + "grad_norm": 0.6510287843312375, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 34861 + }, + { + "epoch": 0.34862, + "grad_norm": 0.7122961797898909, + "learning_rate": 0.003, + "loss": 4.024, + "step": 34862 + }, + { + "epoch": 0.34863, + "grad_norm": 0.8678660886149889, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 34863 + }, + { + "epoch": 0.34864, + "grad_norm": 1.0452655633991765, + "learning_rate": 0.003, + "loss": 4.059, + "step": 34864 + }, + { + "epoch": 0.34865, + "grad_norm": 1.070535273349963, + "learning_rate": 0.003, + "loss": 4.045, + "step": 34865 + }, + { + "epoch": 0.34866, + "grad_norm": 1.0987204134562238, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 34866 + }, + { + "epoch": 0.34867, + "grad_norm": 1.0194123626621454, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 34867 + }, + { + "epoch": 0.34868, + "grad_norm": 1.1450976335033896, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 34868 + }, + { + "epoch": 0.34869, + "grad_norm": 0.9508407289361762, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 34869 + }, + { + "epoch": 0.3487, + "grad_norm": 0.8773498017567068, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 34870 + }, + { + "epoch": 0.34871, + "grad_norm": 0.7749844636857697, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 34871 + }, + { + "epoch": 0.34872, + "grad_norm": 0.7835684393940977, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 34872 + }, + { + "epoch": 0.34873, + "grad_norm": 0.8181360259539251, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 34873 + }, + { + "epoch": 0.34874, + "grad_norm": 0.8339656126350715, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 34874 + }, + { + "epoch": 0.34875, + "grad_norm": 0.9468911205620493, + "learning_rate": 0.003, + "loss": 4.041, + "step": 34875 + }, + { + "epoch": 0.34876, + "grad_norm": 0.9454886558754316, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 34876 + }, + { + "epoch": 0.34877, + "grad_norm": 0.7671692708072375, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 34877 + }, + { + "epoch": 0.34878, + "grad_norm": 0.7498095452104756, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 34878 + }, + { + "epoch": 0.34879, + "grad_norm": 0.76104444175185, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 34879 + }, + { + "epoch": 0.3488, + "grad_norm": 1.0246773668676816, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 34880 + }, + { + "epoch": 0.34881, + "grad_norm": 1.1608660473743995, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 34881 + }, + { + "epoch": 0.34882, + "grad_norm": 0.798586879776697, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 34882 + }, + { + "epoch": 0.34883, + "grad_norm": 0.8279175815924015, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 34883 + }, + { + "epoch": 0.34884, + "grad_norm": 0.8724400606069904, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 34884 + }, + { + "epoch": 0.34885, + "grad_norm": 0.8295665767374297, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 34885 + }, + { + "epoch": 0.34886, + "grad_norm": 0.7401645214132678, + "learning_rate": 0.003, + "loss": 4.04, + "step": 34886 + }, + { + "epoch": 0.34887, + "grad_norm": 0.7089590289190293, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 34887 + }, + { + "epoch": 0.34888, + "grad_norm": 0.7372603527671039, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 34888 + }, + { + "epoch": 0.34889, + "grad_norm": 0.8359724033807238, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 34889 + }, + { + "epoch": 0.3489, + "grad_norm": 1.0874551319701007, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 34890 + }, + { + "epoch": 0.34891, + "grad_norm": 0.9708819271664375, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 34891 + }, + { + "epoch": 0.34892, + "grad_norm": 0.9126952802270163, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 34892 + }, + { + "epoch": 0.34893, + "grad_norm": 0.7477364867712268, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 34893 + }, + { + "epoch": 0.34894, + "grad_norm": 0.6833933850107405, + "learning_rate": 0.003, + "loss": 4.047, + "step": 34894 + }, + { + "epoch": 0.34895, + "grad_norm": 0.7990856563361497, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 34895 + }, + { + "epoch": 0.34896, + "grad_norm": 0.7311515511974204, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 34896 + }, + { + "epoch": 0.34897, + "grad_norm": 0.7903113160565656, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 34897 + }, + { + "epoch": 0.34898, + "grad_norm": 0.8242391461252985, + "learning_rate": 0.003, + "loss": 4.041, + "step": 34898 + }, + { + "epoch": 0.34899, + "grad_norm": 0.6987967886226235, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 34899 + }, + { + "epoch": 0.349, + "grad_norm": 0.5935222691876121, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 34900 + }, + { + "epoch": 0.34901, + "grad_norm": 0.6263117706365163, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 34901 + }, + { + "epoch": 0.34902, + "grad_norm": 0.6088291114340829, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 34902 + }, + { + "epoch": 0.34903, + "grad_norm": 0.6601848700701307, + "learning_rate": 0.003, + "loss": 4.002, + "step": 34903 + }, + { + "epoch": 0.34904, + "grad_norm": 0.7829483864790328, + "learning_rate": 0.003, + "loss": 4.0019, + "step": 34904 + }, + { + "epoch": 0.34905, + "grad_norm": 1.0071179558286854, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 34905 + }, + { + "epoch": 0.34906, + "grad_norm": 1.321081572535562, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 34906 + }, + { + "epoch": 0.34907, + "grad_norm": 0.7691147844505147, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 34907 + }, + { + "epoch": 0.34908, + "grad_norm": 0.7701817677468953, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 34908 + }, + { + "epoch": 0.34909, + "grad_norm": 0.7320215391687613, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 34909 + }, + { + "epoch": 0.3491, + "grad_norm": 0.7363600511788945, + "learning_rate": 0.003, + "loss": 4.058, + "step": 34910 + }, + { + "epoch": 0.34911, + "grad_norm": 0.7328670737274582, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 34911 + }, + { + "epoch": 0.34912, + "grad_norm": 0.6882149201952159, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 34912 + }, + { + "epoch": 0.34913, + "grad_norm": 0.7130593051324376, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 34913 + }, + { + "epoch": 0.34914, + "grad_norm": 0.7283930664198441, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 34914 + }, + { + "epoch": 0.34915, + "grad_norm": 0.8252341597086985, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 34915 + }, + { + "epoch": 0.34916, + "grad_norm": 1.0672052679839612, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 34916 + }, + { + "epoch": 0.34917, + "grad_norm": 1.2547054263067807, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 34917 + }, + { + "epoch": 0.34918, + "grad_norm": 0.6214162633391549, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 34918 + }, + { + "epoch": 0.34919, + "grad_norm": 0.7899906182437352, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 34919 + }, + { + "epoch": 0.3492, + "grad_norm": 1.043777560421063, + "learning_rate": 0.003, + "loss": 4.046, + "step": 34920 + }, + { + "epoch": 0.34921, + "grad_norm": 1.0720965230653403, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 34921 + }, + { + "epoch": 0.34922, + "grad_norm": 0.8601826364418788, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 34922 + }, + { + "epoch": 0.34923, + "grad_norm": 0.9235043692964333, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 34923 + }, + { + "epoch": 0.34924, + "grad_norm": 0.9626484774285056, + "learning_rate": 0.003, + "loss": 4.049, + "step": 34924 + }, + { + "epoch": 0.34925, + "grad_norm": 0.9806554946534723, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 34925 + }, + { + "epoch": 0.34926, + "grad_norm": 1.0473652809570817, + "learning_rate": 0.003, + "loss": 4.0002, + "step": 34926 + }, + { + "epoch": 0.34927, + "grad_norm": 0.8903413268018272, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 34927 + }, + { + "epoch": 0.34928, + "grad_norm": 0.9558844563546772, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 34928 + }, + { + "epoch": 0.34929, + "grad_norm": 0.9889250032092969, + "learning_rate": 0.003, + "loss": 4.035, + "step": 34929 + }, + { + "epoch": 0.3493, + "grad_norm": 0.825405417696535, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 34930 + }, + { + "epoch": 0.34931, + "grad_norm": 0.826282040084365, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 34931 + }, + { + "epoch": 0.34932, + "grad_norm": 0.8087967877963714, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 34932 + }, + { + "epoch": 0.34933, + "grad_norm": 0.8591627044173537, + "learning_rate": 0.003, + "loss": 4.014, + "step": 34933 + }, + { + "epoch": 0.34934, + "grad_norm": 0.8110623709338638, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 34934 + }, + { + "epoch": 0.34935, + "grad_norm": 0.856467720450207, + "learning_rate": 0.003, + "loss": 4.049, + "step": 34935 + }, + { + "epoch": 0.34936, + "grad_norm": 0.8106373327325829, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 34936 + }, + { + "epoch": 0.34937, + "grad_norm": 0.8173737568040644, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 34937 + }, + { + "epoch": 0.34938, + "grad_norm": 0.8439875931995124, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 34938 + }, + { + "epoch": 0.34939, + "grad_norm": 0.908260804305364, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 34939 + }, + { + "epoch": 0.3494, + "grad_norm": 0.9759154782961252, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 34940 + }, + { + "epoch": 0.34941, + "grad_norm": 1.0036865543070173, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 34941 + }, + { + "epoch": 0.34942, + "grad_norm": 1.2358434232587203, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 34942 + }, + { + "epoch": 0.34943, + "grad_norm": 0.866674354132802, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 34943 + }, + { + "epoch": 0.34944, + "grad_norm": 0.7842726358655768, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 34944 + }, + { + "epoch": 0.34945, + "grad_norm": 0.8134469222597768, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 34945 + }, + { + "epoch": 0.34946, + "grad_norm": 0.7236051077331772, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 34946 + }, + { + "epoch": 0.34947, + "grad_norm": 0.6664438499428421, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 34947 + }, + { + "epoch": 0.34948, + "grad_norm": 0.7039469222273763, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 34948 + }, + { + "epoch": 0.34949, + "grad_norm": 0.8392810897136651, + "learning_rate": 0.003, + "loss": 4.0036, + "step": 34949 + }, + { + "epoch": 0.3495, + "grad_norm": 1.0181572281318558, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 34950 + }, + { + "epoch": 0.34951, + "grad_norm": 1.103188692030388, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 34951 + }, + { + "epoch": 0.34952, + "grad_norm": 0.723930198452184, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 34952 + }, + { + "epoch": 0.34953, + "grad_norm": 0.6737382053606537, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 34953 + }, + { + "epoch": 0.34954, + "grad_norm": 0.6419348924660938, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 34954 + }, + { + "epoch": 0.34955, + "grad_norm": 0.6441704693685999, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 34955 + }, + { + "epoch": 0.34956, + "grad_norm": 0.7385902097906977, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 34956 + }, + { + "epoch": 0.34957, + "grad_norm": 0.796753005930694, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 34957 + }, + { + "epoch": 0.34958, + "grad_norm": 0.766886692344842, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 34958 + }, + { + "epoch": 0.34959, + "grad_norm": 0.7679551285044743, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 34959 + }, + { + "epoch": 0.3496, + "grad_norm": 0.7362553846525967, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 34960 + }, + { + "epoch": 0.34961, + "grad_norm": 0.709035703445654, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 34961 + }, + { + "epoch": 0.34962, + "grad_norm": 0.6908267010907563, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 34962 + }, + { + "epoch": 0.34963, + "grad_norm": 0.7499825392255908, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 34963 + }, + { + "epoch": 0.34964, + "grad_norm": 0.6418890020898625, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 34964 + }, + { + "epoch": 0.34965, + "grad_norm": 0.6551704798621404, + "learning_rate": 0.003, + "loss": 3.9993, + "step": 34965 + }, + { + "epoch": 0.34966, + "grad_norm": 0.6788889742881529, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 34966 + }, + { + "epoch": 0.34967, + "grad_norm": 0.6532510417139951, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 34967 + }, + { + "epoch": 0.34968, + "grad_norm": 0.5852257473807752, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 34968 + }, + { + "epoch": 0.34969, + "grad_norm": 0.5820421455722822, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 34969 + }, + { + "epoch": 0.3497, + "grad_norm": 0.5476142699187124, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 34970 + }, + { + "epoch": 0.34971, + "grad_norm": 0.5752383288093383, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 34971 + }, + { + "epoch": 0.34972, + "grad_norm": 0.6847070212820777, + "learning_rate": 0.003, + "loss": 3.9992, + "step": 34972 + }, + { + "epoch": 0.34973, + "grad_norm": 0.7854392137693033, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 34973 + }, + { + "epoch": 0.34974, + "grad_norm": 1.053135344836148, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 34974 + }, + { + "epoch": 0.34975, + "grad_norm": 1.3375575311642083, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 34975 + }, + { + "epoch": 0.34976, + "grad_norm": 0.8304729160261032, + "learning_rate": 0.003, + "loss": 3.9996, + "step": 34976 + }, + { + "epoch": 0.34977, + "grad_norm": 0.7550708585476028, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 34977 + }, + { + "epoch": 0.34978, + "grad_norm": 0.7425797556362294, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 34978 + }, + { + "epoch": 0.34979, + "grad_norm": 0.954096013996938, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 34979 + }, + { + "epoch": 0.3498, + "grad_norm": 1.0912874154665941, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 34980 + }, + { + "epoch": 0.34981, + "grad_norm": 0.9734549742054219, + "learning_rate": 0.003, + "loss": 4.036, + "step": 34981 + }, + { + "epoch": 0.34982, + "grad_norm": 1.024006641988347, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 34982 + }, + { + "epoch": 0.34983, + "grad_norm": 0.9228472223504327, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 34983 + }, + { + "epoch": 0.34984, + "grad_norm": 0.9363402346440106, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 34984 + }, + { + "epoch": 0.34985, + "grad_norm": 1.069076097469853, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 34985 + }, + { + "epoch": 0.34986, + "grad_norm": 1.0387048086800195, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 34986 + }, + { + "epoch": 0.34987, + "grad_norm": 1.0044865544482706, + "learning_rate": 0.003, + "loss": 4.019, + "step": 34987 + }, + { + "epoch": 0.34988, + "grad_norm": 1.2686727085051517, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 34988 + }, + { + "epoch": 0.34989, + "grad_norm": 0.9355598162575778, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 34989 + }, + { + "epoch": 0.3499, + "grad_norm": 0.8710304163430269, + "learning_rate": 0.003, + "loss": 4.079, + "step": 34990 + }, + { + "epoch": 0.34991, + "grad_norm": 0.8257121968182667, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 34991 + }, + { + "epoch": 0.34992, + "grad_norm": 0.8396003288220003, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 34992 + }, + { + "epoch": 0.34993, + "grad_norm": 0.8349132604862769, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 34993 + }, + { + "epoch": 0.34994, + "grad_norm": 0.9069760332043583, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 34994 + }, + { + "epoch": 0.34995, + "grad_norm": 0.8696612831725559, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 34995 + }, + { + "epoch": 0.34996, + "grad_norm": 0.9211346313192956, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 34996 + }, + { + "epoch": 0.34997, + "grad_norm": 0.99567284058509, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 34997 + }, + { + "epoch": 0.34998, + "grad_norm": 1.1283076331874475, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 34998 + }, + { + "epoch": 0.34999, + "grad_norm": 0.8725202183638248, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 34999 + }, + { + "epoch": 0.35, + "grad_norm": 0.7830043897907385, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 35000 + }, + { + "epoch": 0.35001, + "grad_norm": 0.7517297792424326, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 35001 + }, + { + "epoch": 0.35002, + "grad_norm": 0.7792110701093973, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 35002 + }, + { + "epoch": 0.35003, + "grad_norm": 0.7301755495280043, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 35003 + }, + { + "epoch": 0.35004, + "grad_norm": 0.7814715615626235, + "learning_rate": 0.003, + "loss": 4.047, + "step": 35004 + }, + { + "epoch": 0.35005, + "grad_norm": 0.9511098641968765, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 35005 + }, + { + "epoch": 0.35006, + "grad_norm": 1.1690917819110918, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 35006 + }, + { + "epoch": 0.35007, + "grad_norm": 0.9408308480293311, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 35007 + }, + { + "epoch": 0.35008, + "grad_norm": 1.037018529443783, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 35008 + }, + { + "epoch": 0.35009, + "grad_norm": 0.8957172442389573, + "learning_rate": 0.003, + "loss": 4.018, + "step": 35009 + }, + { + "epoch": 0.3501, + "grad_norm": 0.8415008616100952, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 35010 + }, + { + "epoch": 0.35011, + "grad_norm": 0.7150578389441221, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 35011 + }, + { + "epoch": 0.35012, + "grad_norm": 0.6340238671011771, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 35012 + }, + { + "epoch": 0.35013, + "grad_norm": 0.6287837210175218, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 35013 + }, + { + "epoch": 0.35014, + "grad_norm": 0.7477959421676172, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 35014 + }, + { + "epoch": 0.35015, + "grad_norm": 0.6712303300886862, + "learning_rate": 0.003, + "loss": 4.0035, + "step": 35015 + }, + { + "epoch": 0.35016, + "grad_norm": 0.6518584338173726, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 35016 + }, + { + "epoch": 0.35017, + "grad_norm": 0.632700754663963, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 35017 + }, + { + "epoch": 0.35018, + "grad_norm": 0.6495640951944327, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 35018 + }, + { + "epoch": 0.35019, + "grad_norm": 0.7376806819845483, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 35019 + }, + { + "epoch": 0.3502, + "grad_norm": 0.8201940541838998, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 35020 + }, + { + "epoch": 0.35021, + "grad_norm": 0.8378929810724172, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 35021 + }, + { + "epoch": 0.35022, + "grad_norm": 0.8383607931304853, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 35022 + }, + { + "epoch": 0.35023, + "grad_norm": 0.8889798925536566, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 35023 + }, + { + "epoch": 0.35024, + "grad_norm": 0.8105888107757856, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 35024 + }, + { + "epoch": 0.35025, + "grad_norm": 0.7387957090083844, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 35025 + }, + { + "epoch": 0.35026, + "grad_norm": 0.8002618347208682, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 35026 + }, + { + "epoch": 0.35027, + "grad_norm": 0.9158858928219336, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 35027 + }, + { + "epoch": 0.35028, + "grad_norm": 0.9733923708503148, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 35028 + }, + { + "epoch": 0.35029, + "grad_norm": 1.08049571183365, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 35029 + }, + { + "epoch": 0.3503, + "grad_norm": 0.8776048078994035, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 35030 + }, + { + "epoch": 0.35031, + "grad_norm": 0.6797855022436746, + "learning_rate": 0.003, + "loss": 4.021, + "step": 35031 + }, + { + "epoch": 0.35032, + "grad_norm": 0.698300708642211, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 35032 + }, + { + "epoch": 0.35033, + "grad_norm": 0.7624607982630223, + "learning_rate": 0.003, + "loss": 4.02, + "step": 35033 + }, + { + "epoch": 0.35034, + "grad_norm": 0.7375379510344108, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 35034 + }, + { + "epoch": 0.35035, + "grad_norm": 0.7304271560357061, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 35035 + }, + { + "epoch": 0.35036, + "grad_norm": 0.6533157696109166, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 35036 + }, + { + "epoch": 0.35037, + "grad_norm": 0.6090612250027095, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 35037 + }, + { + "epoch": 0.35038, + "grad_norm": 0.5466583554433534, + "learning_rate": 0.003, + "loss": 4.003, + "step": 35038 + }, + { + "epoch": 0.35039, + "grad_norm": 0.6034909368229479, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 35039 + }, + { + "epoch": 0.3504, + "grad_norm": 0.6260186228290786, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 35040 + }, + { + "epoch": 0.35041, + "grad_norm": 0.6275060184941971, + "learning_rate": 0.003, + "loss": 3.9833, + "step": 35041 + }, + { + "epoch": 0.35042, + "grad_norm": 0.6579380967251274, + "learning_rate": 0.003, + "loss": 4.031, + "step": 35042 + }, + { + "epoch": 0.35043, + "grad_norm": 0.7684750543074379, + "learning_rate": 0.003, + "loss": 4.017, + "step": 35043 + }, + { + "epoch": 0.35044, + "grad_norm": 1.0964682454624022, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 35044 + }, + { + "epoch": 0.35045, + "grad_norm": 1.2887835171915043, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 35045 + }, + { + "epoch": 0.35046, + "grad_norm": 0.6590118362285846, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 35046 + }, + { + "epoch": 0.35047, + "grad_norm": 0.6883438285716045, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 35047 + }, + { + "epoch": 0.35048, + "grad_norm": 0.685641787428032, + "learning_rate": 0.003, + "loss": 4.031, + "step": 35048 + }, + { + "epoch": 0.35049, + "grad_norm": 0.7409801216819885, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 35049 + }, + { + "epoch": 0.3505, + "grad_norm": 0.6530823937737744, + "learning_rate": 0.003, + "loss": 4.0028, + "step": 35050 + }, + { + "epoch": 0.35051, + "grad_norm": 0.6887583747543514, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 35051 + }, + { + "epoch": 0.35052, + "grad_norm": 0.7730769030325064, + "learning_rate": 0.003, + "loss": 4.014, + "step": 35052 + }, + { + "epoch": 0.35053, + "grad_norm": 0.9486658117690205, + "learning_rate": 0.003, + "loss": 4.027, + "step": 35053 + }, + { + "epoch": 0.35054, + "grad_norm": 1.1602073327777291, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 35054 + }, + { + "epoch": 0.35055, + "grad_norm": 1.0764205889732579, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 35055 + }, + { + "epoch": 0.35056, + "grad_norm": 1.0017567716364841, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 35056 + }, + { + "epoch": 0.35057, + "grad_norm": 1.0197284361419308, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 35057 + }, + { + "epoch": 0.35058, + "grad_norm": 1.0881178406090495, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 35058 + }, + { + "epoch": 0.35059, + "grad_norm": 0.7943516792366818, + "learning_rate": 0.003, + "loss": 4.0066, + "step": 35059 + }, + { + "epoch": 0.3506, + "grad_norm": 0.7850535021459475, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 35060 + }, + { + "epoch": 0.35061, + "grad_norm": 0.7595985981427459, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 35061 + }, + { + "epoch": 0.35062, + "grad_norm": 0.8174397123323409, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 35062 + }, + { + "epoch": 0.35063, + "grad_norm": 0.8187734647871031, + "learning_rate": 0.003, + "loss": 4.045, + "step": 35063 + }, + { + "epoch": 0.35064, + "grad_norm": 0.8107168183419964, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 35064 + }, + { + "epoch": 0.35065, + "grad_norm": 1.0866783693661772, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 35065 + }, + { + "epoch": 0.35066, + "grad_norm": 1.0593496247202612, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 35066 + }, + { + "epoch": 0.35067, + "grad_norm": 0.8664509996795793, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 35067 + }, + { + "epoch": 0.35068, + "grad_norm": 0.8209897301815572, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 35068 + }, + { + "epoch": 0.35069, + "grad_norm": 0.7966027776052522, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 35069 + }, + { + "epoch": 0.3507, + "grad_norm": 0.7688643249370594, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 35070 + }, + { + "epoch": 0.35071, + "grad_norm": 0.8690179250804031, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 35071 + }, + { + "epoch": 0.35072, + "grad_norm": 0.9819582714983514, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 35072 + }, + { + "epoch": 0.35073, + "grad_norm": 1.2962567416010777, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 35073 + }, + { + "epoch": 0.35074, + "grad_norm": 0.8910626013633564, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 35074 + }, + { + "epoch": 0.35075, + "grad_norm": 1.000108646013681, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 35075 + }, + { + "epoch": 0.35076, + "grad_norm": 1.1141919912715788, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 35076 + }, + { + "epoch": 0.35077, + "grad_norm": 0.900985244109463, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 35077 + }, + { + "epoch": 0.35078, + "grad_norm": 0.8776270982778284, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 35078 + }, + { + "epoch": 0.35079, + "grad_norm": 0.8773465509027232, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 35079 + }, + { + "epoch": 0.3508, + "grad_norm": 0.9806824485119736, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 35080 + }, + { + "epoch": 0.35081, + "grad_norm": 0.9835573241960879, + "learning_rate": 0.003, + "loss": 4.053, + "step": 35081 + }, + { + "epoch": 0.35082, + "grad_norm": 0.828644375756164, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 35082 + }, + { + "epoch": 0.35083, + "grad_norm": 0.758955707960909, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 35083 + }, + { + "epoch": 0.35084, + "grad_norm": 0.8164085080549616, + "learning_rate": 0.003, + "loss": 4.029, + "step": 35084 + }, + { + "epoch": 0.35085, + "grad_norm": 0.7387917598679636, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 35085 + }, + { + "epoch": 0.35086, + "grad_norm": 0.6774982655073124, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 35086 + }, + { + "epoch": 0.35087, + "grad_norm": 0.6800619600513751, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 35087 + }, + { + "epoch": 0.35088, + "grad_norm": 0.7633018001080065, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 35088 + }, + { + "epoch": 0.35089, + "grad_norm": 0.9152842223309234, + "learning_rate": 0.003, + "loss": 4.0, + "step": 35089 + }, + { + "epoch": 0.3509, + "grad_norm": 0.9418514742602236, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 35090 + }, + { + "epoch": 0.35091, + "grad_norm": 0.9272776545326409, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 35091 + }, + { + "epoch": 0.35092, + "grad_norm": 0.8386504573892916, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 35092 + }, + { + "epoch": 0.35093, + "grad_norm": 0.8622652916646232, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 35093 + }, + { + "epoch": 0.35094, + "grad_norm": 0.7951835212803332, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 35094 + }, + { + "epoch": 0.35095, + "grad_norm": 0.7208987612282648, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 35095 + }, + { + "epoch": 0.35096, + "grad_norm": 0.6909268608991546, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 35096 + }, + { + "epoch": 0.35097, + "grad_norm": 0.6769278636746499, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 35097 + }, + { + "epoch": 0.35098, + "grad_norm": 0.7222917515626588, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 35098 + }, + { + "epoch": 0.35099, + "grad_norm": 0.750258209282132, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 35099 + }, + { + "epoch": 0.351, + "grad_norm": 0.7777574255951669, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 35100 + }, + { + "epoch": 0.35101, + "grad_norm": 0.7516913644886785, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 35101 + }, + { + "epoch": 0.35102, + "grad_norm": 0.8272808931777358, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 35102 + }, + { + "epoch": 0.35103, + "grad_norm": 0.9777459626413545, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 35103 + }, + { + "epoch": 0.35104, + "grad_norm": 1.0834813747047427, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 35104 + }, + { + "epoch": 0.35105, + "grad_norm": 1.1217254029542798, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 35105 + }, + { + "epoch": 0.35106, + "grad_norm": 1.0797312968152044, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 35106 + }, + { + "epoch": 0.35107, + "grad_norm": 0.9130053465551803, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 35107 + }, + { + "epoch": 0.35108, + "grad_norm": 0.8438150857106466, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 35108 + }, + { + "epoch": 0.35109, + "grad_norm": 0.8121404063186625, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 35109 + }, + { + "epoch": 0.3511, + "grad_norm": 0.8211915357299913, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 35110 + }, + { + "epoch": 0.35111, + "grad_norm": 0.9434591928372962, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 35111 + }, + { + "epoch": 0.35112, + "grad_norm": 1.025832081480744, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 35112 + }, + { + "epoch": 0.35113, + "grad_norm": 1.029549871123057, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 35113 + }, + { + "epoch": 0.35114, + "grad_norm": 0.982914146580783, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 35114 + }, + { + "epoch": 0.35115, + "grad_norm": 1.027941043374991, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 35115 + }, + { + "epoch": 0.35116, + "grad_norm": 0.9094824497739847, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 35116 + }, + { + "epoch": 0.35117, + "grad_norm": 0.8413092581113586, + "learning_rate": 0.003, + "loss": 4.034, + "step": 35117 + }, + { + "epoch": 0.35118, + "grad_norm": 0.8298983664439524, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 35118 + }, + { + "epoch": 0.35119, + "grad_norm": 0.8381640324118718, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 35119 + }, + { + "epoch": 0.3512, + "grad_norm": 0.8486525020668845, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 35120 + }, + { + "epoch": 0.35121, + "grad_norm": 0.7631173268234093, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 35121 + }, + { + "epoch": 0.35122, + "grad_norm": 0.6620480132590333, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 35122 + }, + { + "epoch": 0.35123, + "grad_norm": 0.6955631997999159, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 35123 + }, + { + "epoch": 0.35124, + "grad_norm": 0.7301466909232661, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 35124 + }, + { + "epoch": 0.35125, + "grad_norm": 0.794837422485452, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 35125 + }, + { + "epoch": 0.35126, + "grad_norm": 0.8487164461627894, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 35126 + }, + { + "epoch": 0.35127, + "grad_norm": 0.8713276034477562, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 35127 + }, + { + "epoch": 0.35128, + "grad_norm": 0.8857008667795788, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 35128 + }, + { + "epoch": 0.35129, + "grad_norm": 0.9332736617121964, + "learning_rate": 0.003, + "loss": 4.047, + "step": 35129 + }, + { + "epoch": 0.3513, + "grad_norm": 0.8976997090514381, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 35130 + }, + { + "epoch": 0.35131, + "grad_norm": 0.8081145981076185, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 35131 + }, + { + "epoch": 0.35132, + "grad_norm": 0.8129911540637825, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 35132 + }, + { + "epoch": 0.35133, + "grad_norm": 0.7141832848125637, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 35133 + }, + { + "epoch": 0.35134, + "grad_norm": 0.6915840824342027, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 35134 + }, + { + "epoch": 0.35135, + "grad_norm": 0.6257224736512339, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 35135 + }, + { + "epoch": 0.35136, + "grad_norm": 0.6316318545969387, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 35136 + }, + { + "epoch": 0.35137, + "grad_norm": 0.7468413258774057, + "learning_rate": 0.003, + "loss": 3.9997, + "step": 35137 + }, + { + "epoch": 0.35138, + "grad_norm": 0.8461041028950845, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 35138 + }, + { + "epoch": 0.35139, + "grad_norm": 0.772822011711232, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 35139 + }, + { + "epoch": 0.3514, + "grad_norm": 0.7261199507806635, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 35140 + }, + { + "epoch": 0.35141, + "grad_norm": 0.7755111946368917, + "learning_rate": 0.003, + "loss": 4.03, + "step": 35141 + }, + { + "epoch": 0.35142, + "grad_norm": 0.7928812761470443, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 35142 + }, + { + "epoch": 0.35143, + "grad_norm": 0.7675499568856633, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 35143 + }, + { + "epoch": 0.35144, + "grad_norm": 0.7018499770636035, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 35144 + }, + { + "epoch": 0.35145, + "grad_norm": 0.632219657158281, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 35145 + }, + { + "epoch": 0.35146, + "grad_norm": 0.6519725358901763, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 35146 + }, + { + "epoch": 0.35147, + "grad_norm": 0.5824291776649382, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 35147 + }, + { + "epoch": 0.35148, + "grad_norm": 0.5024329999420618, + "learning_rate": 0.003, + "loss": 4.0011, + "step": 35148 + }, + { + "epoch": 0.35149, + "grad_norm": 0.5000654808166667, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 35149 + }, + { + "epoch": 0.3515, + "grad_norm": 0.49174102538565756, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 35150 + }, + { + "epoch": 0.35151, + "grad_norm": 0.5428347939873523, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 35151 + }, + { + "epoch": 0.35152, + "grad_norm": 0.6916883410612155, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 35152 + }, + { + "epoch": 0.35153, + "grad_norm": 0.9568346513416838, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 35153 + }, + { + "epoch": 0.35154, + "grad_norm": 1.4107748345896118, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 35154 + }, + { + "epoch": 0.35155, + "grad_norm": 0.8261708747967849, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 35155 + }, + { + "epoch": 0.35156, + "grad_norm": 0.9293042620697722, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 35156 + }, + { + "epoch": 0.35157, + "grad_norm": 0.9172823753215151, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 35157 + }, + { + "epoch": 0.35158, + "grad_norm": 0.9156288865616014, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 35158 + }, + { + "epoch": 0.35159, + "grad_norm": 1.0510766638757187, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 35159 + }, + { + "epoch": 0.3516, + "grad_norm": 1.042796026095851, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 35160 + }, + { + "epoch": 0.35161, + "grad_norm": 0.9482540605681067, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 35161 + }, + { + "epoch": 0.35162, + "grad_norm": 1.0875008875195806, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 35162 + }, + { + "epoch": 0.35163, + "grad_norm": 0.9090039867269806, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 35163 + }, + { + "epoch": 0.35164, + "grad_norm": 0.9542706349271162, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 35164 + }, + { + "epoch": 0.35165, + "grad_norm": 0.9428979490833859, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 35165 + }, + { + "epoch": 0.35166, + "grad_norm": 0.7806272058367609, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 35166 + }, + { + "epoch": 0.35167, + "grad_norm": 0.9624486171473935, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 35167 + }, + { + "epoch": 0.35168, + "grad_norm": 1.1690748237112876, + "learning_rate": 0.003, + "loss": 4.023, + "step": 35168 + }, + { + "epoch": 0.35169, + "grad_norm": 0.8959193343238345, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 35169 + }, + { + "epoch": 0.3517, + "grad_norm": 0.8881199689791797, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 35170 + }, + { + "epoch": 0.35171, + "grad_norm": 0.9819844887597202, + "learning_rate": 0.003, + "loss": 4.037, + "step": 35171 + }, + { + "epoch": 0.35172, + "grad_norm": 1.0116040716956964, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 35172 + }, + { + "epoch": 0.35173, + "grad_norm": 1.0047978538638624, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 35173 + }, + { + "epoch": 0.35174, + "grad_norm": 1.0293123402739492, + "learning_rate": 0.003, + "loss": 4.062, + "step": 35174 + }, + { + "epoch": 0.35175, + "grad_norm": 0.9532082773177747, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 35175 + }, + { + "epoch": 0.35176, + "grad_norm": 0.9746004059549227, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 35176 + }, + { + "epoch": 0.35177, + "grad_norm": 1.1478827556370528, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 35177 + }, + { + "epoch": 0.35178, + "grad_norm": 0.7804840027115564, + "learning_rate": 0.003, + "loss": 4.055, + "step": 35178 + }, + { + "epoch": 0.35179, + "grad_norm": 0.6885827881226108, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 35179 + }, + { + "epoch": 0.3518, + "grad_norm": 0.7504629046010015, + "learning_rate": 0.003, + "loss": 3.9984, + "step": 35180 + }, + { + "epoch": 0.35181, + "grad_norm": 0.8949395930259307, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 35181 + }, + { + "epoch": 0.35182, + "grad_norm": 0.9167871893743135, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 35182 + }, + { + "epoch": 0.35183, + "grad_norm": 0.9746095546970701, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 35183 + }, + { + "epoch": 0.35184, + "grad_norm": 0.9171353972430094, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 35184 + }, + { + "epoch": 0.35185, + "grad_norm": 0.8173292816387988, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 35185 + }, + { + "epoch": 0.35186, + "grad_norm": 0.7918938065462896, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 35186 + }, + { + "epoch": 0.35187, + "grad_norm": 0.9232843381696075, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 35187 + }, + { + "epoch": 0.35188, + "grad_norm": 0.9725765778081803, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 35188 + }, + { + "epoch": 0.35189, + "grad_norm": 0.9149842898974258, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 35189 + }, + { + "epoch": 0.3519, + "grad_norm": 0.7346880479937125, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 35190 + }, + { + "epoch": 0.35191, + "grad_norm": 0.7354618072173863, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 35191 + }, + { + "epoch": 0.35192, + "grad_norm": 0.7629810862549169, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 35192 + }, + { + "epoch": 0.35193, + "grad_norm": 0.7084090875544662, + "learning_rate": 0.003, + "loss": 4.061, + "step": 35193 + }, + { + "epoch": 0.35194, + "grad_norm": 0.5936256355618695, + "learning_rate": 0.003, + "loss": 4.041, + "step": 35194 + }, + { + "epoch": 0.35195, + "grad_norm": 0.5188344716348993, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 35195 + }, + { + "epoch": 0.35196, + "grad_norm": 0.4958534130197045, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 35196 + }, + { + "epoch": 0.35197, + "grad_norm": 0.5183729214185196, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 35197 + }, + { + "epoch": 0.35198, + "grad_norm": 0.6085589486641731, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 35198 + }, + { + "epoch": 0.35199, + "grad_norm": 0.6873093747896977, + "learning_rate": 0.003, + "loss": 4.036, + "step": 35199 + }, + { + "epoch": 0.352, + "grad_norm": 0.7017582184330183, + "learning_rate": 0.003, + "loss": 3.9897, + "step": 35200 + }, + { + "epoch": 0.35201, + "grad_norm": 0.6940647347600101, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 35201 + }, + { + "epoch": 0.35202, + "grad_norm": 0.690740739673416, + "learning_rate": 0.003, + "loss": 4.0073, + "step": 35202 + }, + { + "epoch": 0.35203, + "grad_norm": 0.8198806429153129, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 35203 + }, + { + "epoch": 0.35204, + "grad_norm": 0.9105584722622847, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 35204 + }, + { + "epoch": 0.35205, + "grad_norm": 0.9707130283968068, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 35205 + }, + { + "epoch": 0.35206, + "grad_norm": 1.003768943623242, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 35206 + }, + { + "epoch": 0.35207, + "grad_norm": 1.0477587565920572, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 35207 + }, + { + "epoch": 0.35208, + "grad_norm": 0.9233216031467486, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 35208 + }, + { + "epoch": 0.35209, + "grad_norm": 0.8374310343632653, + "learning_rate": 0.003, + "loss": 4.046, + "step": 35209 + }, + { + "epoch": 0.3521, + "grad_norm": 0.8832694738029971, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 35210 + }, + { + "epoch": 0.35211, + "grad_norm": 0.9161110887331092, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 35211 + }, + { + "epoch": 0.35212, + "grad_norm": 0.9890461681376855, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 35212 + }, + { + "epoch": 0.35213, + "grad_norm": 1.0634445456620187, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 35213 + }, + { + "epoch": 0.35214, + "grad_norm": 1.109885219426023, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 35214 + }, + { + "epoch": 0.35215, + "grad_norm": 0.8927602059332673, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 35215 + }, + { + "epoch": 0.35216, + "grad_norm": 0.8657659440127494, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 35216 + }, + { + "epoch": 0.35217, + "grad_norm": 0.847471499035758, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 35217 + }, + { + "epoch": 0.35218, + "grad_norm": 0.8605124597892159, + "learning_rate": 0.003, + "loss": 4.012, + "step": 35218 + }, + { + "epoch": 0.35219, + "grad_norm": 0.9073893163124725, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 35219 + }, + { + "epoch": 0.3522, + "grad_norm": 1.0050593060388826, + "learning_rate": 0.003, + "loss": 4.036, + "step": 35220 + }, + { + "epoch": 0.35221, + "grad_norm": 1.153010504498366, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 35221 + }, + { + "epoch": 0.35222, + "grad_norm": 0.8406918688088859, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 35222 + }, + { + "epoch": 0.35223, + "grad_norm": 0.7636479044122304, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 35223 + }, + { + "epoch": 0.35224, + "grad_norm": 0.6711598585789031, + "learning_rate": 0.003, + "loss": 4.04, + "step": 35224 + }, + { + "epoch": 0.35225, + "grad_norm": 0.6965400809562491, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 35225 + }, + { + "epoch": 0.35226, + "grad_norm": 0.6952603549848736, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 35226 + }, + { + "epoch": 0.35227, + "grad_norm": 0.7726354856016462, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 35227 + }, + { + "epoch": 0.35228, + "grad_norm": 0.7705632516058647, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 35228 + }, + { + "epoch": 0.35229, + "grad_norm": 0.7255883960314718, + "learning_rate": 0.003, + "loss": 4.031, + "step": 35229 + }, + { + "epoch": 0.3523, + "grad_norm": 0.674024550072049, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 35230 + }, + { + "epoch": 0.35231, + "grad_norm": 0.6703878154998932, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 35231 + }, + { + "epoch": 0.35232, + "grad_norm": 0.7727737394254651, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 35232 + }, + { + "epoch": 0.35233, + "grad_norm": 0.9421880346423944, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 35233 + }, + { + "epoch": 0.35234, + "grad_norm": 1.029642520496911, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 35234 + }, + { + "epoch": 0.35235, + "grad_norm": 0.9835474427954992, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 35235 + }, + { + "epoch": 0.35236, + "grad_norm": 1.0152696852541423, + "learning_rate": 0.003, + "loss": 4.026, + "step": 35236 + }, + { + "epoch": 0.35237, + "grad_norm": 0.9668427300175659, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 35237 + }, + { + "epoch": 0.35238, + "grad_norm": 0.9509871512851996, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 35238 + }, + { + "epoch": 0.35239, + "grad_norm": 1.026365856446077, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 35239 + }, + { + "epoch": 0.3524, + "grad_norm": 0.9773777733028364, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 35240 + }, + { + "epoch": 0.35241, + "grad_norm": 0.879535097880324, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 35241 + }, + { + "epoch": 0.35242, + "grad_norm": 0.713640638616957, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 35242 + }, + { + "epoch": 0.35243, + "grad_norm": 0.7524319737468896, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 35243 + }, + { + "epoch": 0.35244, + "grad_norm": 0.8643963325644647, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 35244 + }, + { + "epoch": 0.35245, + "grad_norm": 0.8381654620170323, + "learning_rate": 0.003, + "loss": 4.071, + "step": 35245 + }, + { + "epoch": 0.35246, + "grad_norm": 0.8452606839376504, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 35246 + }, + { + "epoch": 0.35247, + "grad_norm": 0.6327150633186418, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 35247 + }, + { + "epoch": 0.35248, + "grad_norm": 0.5804175720552595, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 35248 + }, + { + "epoch": 0.35249, + "grad_norm": 0.5289644308204289, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 35249 + }, + { + "epoch": 0.3525, + "grad_norm": 0.5619816666662856, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 35250 + }, + { + "epoch": 0.35251, + "grad_norm": 0.6180242065111174, + "learning_rate": 0.003, + "loss": 4.0068, + "step": 35251 + }, + { + "epoch": 0.35252, + "grad_norm": 0.7450196800734594, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 35252 + }, + { + "epoch": 0.35253, + "grad_norm": 0.9477313461212996, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 35253 + }, + { + "epoch": 0.35254, + "grad_norm": 1.1819879939281142, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 35254 + }, + { + "epoch": 0.35255, + "grad_norm": 0.6607777513171338, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 35255 + }, + { + "epoch": 0.35256, + "grad_norm": 0.5182338901969771, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 35256 + }, + { + "epoch": 0.35257, + "grad_norm": 0.7826713776692046, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 35257 + }, + { + "epoch": 0.35258, + "grad_norm": 0.9464866036150354, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 35258 + }, + { + "epoch": 0.35259, + "grad_norm": 0.9698199904881265, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 35259 + }, + { + "epoch": 0.3526, + "grad_norm": 0.953806695998152, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 35260 + }, + { + "epoch": 0.35261, + "grad_norm": 1.0306115448608555, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 35261 + }, + { + "epoch": 0.35262, + "grad_norm": 1.0117067658906804, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 35262 + }, + { + "epoch": 0.35263, + "grad_norm": 1.0153326214395861, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 35263 + }, + { + "epoch": 0.35264, + "grad_norm": 0.9912024881233239, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 35264 + }, + { + "epoch": 0.35265, + "grad_norm": 1.0083962864751475, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 35265 + }, + { + "epoch": 0.35266, + "grad_norm": 1.1541331472113723, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 35266 + }, + { + "epoch": 0.35267, + "grad_norm": 0.9140923766461305, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 35267 + }, + { + "epoch": 0.35268, + "grad_norm": 0.7486582971515309, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 35268 + }, + { + "epoch": 0.35269, + "grad_norm": 0.6090931035653594, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 35269 + }, + { + "epoch": 0.3527, + "grad_norm": 0.6005395353085234, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 35270 + }, + { + "epoch": 0.35271, + "grad_norm": 0.7089725678297071, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 35271 + }, + { + "epoch": 0.35272, + "grad_norm": 0.6582917452391771, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 35272 + }, + { + "epoch": 0.35273, + "grad_norm": 0.6297966682698514, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 35273 + }, + { + "epoch": 0.35274, + "grad_norm": 0.7195598883254993, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 35274 + }, + { + "epoch": 0.35275, + "grad_norm": 0.8399066323297114, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 35275 + }, + { + "epoch": 0.35276, + "grad_norm": 0.7940913993773373, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 35276 + }, + { + "epoch": 0.35277, + "grad_norm": 0.660348767931516, + "learning_rate": 0.003, + "loss": 4.0006, + "step": 35277 + }, + { + "epoch": 0.35278, + "grad_norm": 0.5605881054341464, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 35278 + }, + { + "epoch": 0.35279, + "grad_norm": 0.5878417607232299, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 35279 + }, + { + "epoch": 0.3528, + "grad_norm": 0.7528141937469021, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 35280 + }, + { + "epoch": 0.35281, + "grad_norm": 0.8652571896198665, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 35281 + }, + { + "epoch": 0.35282, + "grad_norm": 0.9390485229867606, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 35282 + }, + { + "epoch": 0.35283, + "grad_norm": 1.0761150772442634, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 35283 + }, + { + "epoch": 0.35284, + "grad_norm": 0.9711986991788634, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 35284 + }, + { + "epoch": 0.35285, + "grad_norm": 0.9350176493550356, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 35285 + }, + { + "epoch": 0.35286, + "grad_norm": 0.8946115564270194, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 35286 + }, + { + "epoch": 0.35287, + "grad_norm": 0.7554610157216101, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 35287 + }, + { + "epoch": 0.35288, + "grad_norm": 0.7125452595360842, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 35288 + }, + { + "epoch": 0.35289, + "grad_norm": 0.5689732281880509, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 35289 + }, + { + "epoch": 0.3529, + "grad_norm": 0.5408928747346811, + "learning_rate": 0.003, + "loss": 4.021, + "step": 35290 + }, + { + "epoch": 0.35291, + "grad_norm": 0.6068706962545, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 35291 + }, + { + "epoch": 0.35292, + "grad_norm": 0.7454729650156972, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 35292 + }, + { + "epoch": 0.35293, + "grad_norm": 0.9724543930770964, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 35293 + }, + { + "epoch": 0.35294, + "grad_norm": 1.232415292237379, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 35294 + }, + { + "epoch": 0.35295, + "grad_norm": 0.7685965333663676, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 35295 + }, + { + "epoch": 0.35296, + "grad_norm": 0.6873593666600332, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 35296 + }, + { + "epoch": 0.35297, + "grad_norm": 0.6782988605592402, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 35297 + }, + { + "epoch": 0.35298, + "grad_norm": 0.7074295438460946, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 35298 + }, + { + "epoch": 0.35299, + "grad_norm": 0.7315444233805894, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 35299 + }, + { + "epoch": 0.353, + "grad_norm": 0.7762064958766007, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 35300 + }, + { + "epoch": 0.35301, + "grad_norm": 0.8224294025287957, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 35301 + }, + { + "epoch": 0.35302, + "grad_norm": 0.9688558956074059, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 35302 + }, + { + "epoch": 0.35303, + "grad_norm": 1.0780247342951372, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 35303 + }, + { + "epoch": 0.35304, + "grad_norm": 0.8711172263839682, + "learning_rate": 0.003, + "loss": 4.025, + "step": 35304 + }, + { + "epoch": 0.35305, + "grad_norm": 0.8078287486399488, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 35305 + }, + { + "epoch": 0.35306, + "grad_norm": 0.7396013986913598, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 35306 + }, + { + "epoch": 0.35307, + "grad_norm": 0.6686000744893293, + "learning_rate": 0.003, + "loss": 4.054, + "step": 35307 + }, + { + "epoch": 0.35308, + "grad_norm": 0.7225306264438495, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 35308 + }, + { + "epoch": 0.35309, + "grad_norm": 0.6450555661790196, + "learning_rate": 0.003, + "loss": 3.9985, + "step": 35309 + }, + { + "epoch": 0.3531, + "grad_norm": 0.6563703282321344, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 35310 + }, + { + "epoch": 0.35311, + "grad_norm": 0.7057884901969602, + "learning_rate": 0.003, + "loss": 4.031, + "step": 35311 + }, + { + "epoch": 0.35312, + "grad_norm": 0.8101023891292665, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 35312 + }, + { + "epoch": 0.35313, + "grad_norm": 1.0249610509011964, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 35313 + }, + { + "epoch": 0.35314, + "grad_norm": 1.133988599462333, + "learning_rate": 0.003, + "loss": 3.9984, + "step": 35314 + }, + { + "epoch": 0.35315, + "grad_norm": 0.9100342553308453, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 35315 + }, + { + "epoch": 0.35316, + "grad_norm": 0.8690460268490119, + "learning_rate": 0.003, + "loss": 3.9925, + "step": 35316 + }, + { + "epoch": 0.35317, + "grad_norm": 0.861741682447213, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 35317 + }, + { + "epoch": 0.35318, + "grad_norm": 1.0478812593091773, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 35318 + }, + { + "epoch": 0.35319, + "grad_norm": 0.9145304262181804, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 35319 + }, + { + "epoch": 0.3532, + "grad_norm": 0.8776495542421046, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 35320 + }, + { + "epoch": 0.35321, + "grad_norm": 0.836206451867468, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 35321 + }, + { + "epoch": 0.35322, + "grad_norm": 0.8026880283593412, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 35322 + }, + { + "epoch": 0.35323, + "grad_norm": 0.8403358542862727, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 35323 + }, + { + "epoch": 0.35324, + "grad_norm": 0.8755099906895667, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 35324 + }, + { + "epoch": 0.35325, + "grad_norm": 0.9190745476207055, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 35325 + }, + { + "epoch": 0.35326, + "grad_norm": 0.8350433822593392, + "learning_rate": 0.003, + "loss": 4.0076, + "step": 35326 + }, + { + "epoch": 0.35327, + "grad_norm": 0.7379402144776968, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 35327 + }, + { + "epoch": 0.35328, + "grad_norm": 0.7169295253046076, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 35328 + }, + { + "epoch": 0.35329, + "grad_norm": 0.8472143100561214, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 35329 + }, + { + "epoch": 0.3533, + "grad_norm": 0.8797433394605029, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 35330 + }, + { + "epoch": 0.35331, + "grad_norm": 0.7804285090975526, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 35331 + }, + { + "epoch": 0.35332, + "grad_norm": 0.7806297996145116, + "learning_rate": 0.003, + "loss": 4.058, + "step": 35332 + }, + { + "epoch": 0.35333, + "grad_norm": 0.8798346598999109, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 35333 + }, + { + "epoch": 0.35334, + "grad_norm": 0.9706168380966738, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 35334 + }, + { + "epoch": 0.35335, + "grad_norm": 1.0347084731161396, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 35335 + }, + { + "epoch": 0.35336, + "grad_norm": 0.8972123458236516, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 35336 + }, + { + "epoch": 0.35337, + "grad_norm": 0.864444232949959, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 35337 + }, + { + "epoch": 0.35338, + "grad_norm": 0.8705508409423729, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 35338 + }, + { + "epoch": 0.35339, + "grad_norm": 0.9696513539574135, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 35339 + }, + { + "epoch": 0.3534, + "grad_norm": 1.064576144927214, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 35340 + }, + { + "epoch": 0.35341, + "grad_norm": 0.9757702989934408, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 35341 + }, + { + "epoch": 0.35342, + "grad_norm": 1.1119081591992588, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 35342 + }, + { + "epoch": 0.35343, + "grad_norm": 0.852659547569621, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 35343 + }, + { + "epoch": 0.35344, + "grad_norm": 0.8834683625706019, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 35344 + }, + { + "epoch": 0.35345, + "grad_norm": 0.756295042970097, + "learning_rate": 0.003, + "loss": 4.074, + "step": 35345 + }, + { + "epoch": 0.35346, + "grad_norm": 0.6875298182348494, + "learning_rate": 0.003, + "loss": 4.023, + "step": 35346 + }, + { + "epoch": 0.35347, + "grad_norm": 0.6839180575795568, + "learning_rate": 0.003, + "loss": 4.055, + "step": 35347 + }, + { + "epoch": 0.35348, + "grad_norm": 0.7869682067176487, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 35348 + }, + { + "epoch": 0.35349, + "grad_norm": 0.9250045873983749, + "learning_rate": 0.003, + "loss": 4.058, + "step": 35349 + }, + { + "epoch": 0.3535, + "grad_norm": 1.020484221466413, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 35350 + }, + { + "epoch": 0.35351, + "grad_norm": 0.9741178258701142, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 35351 + }, + { + "epoch": 0.35352, + "grad_norm": 0.9100336993704421, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 35352 + }, + { + "epoch": 0.35353, + "grad_norm": 0.7671341377722294, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 35353 + }, + { + "epoch": 0.35354, + "grad_norm": 0.7556911563786995, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 35354 + }, + { + "epoch": 0.35355, + "grad_norm": 0.7869474495516388, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 35355 + }, + { + "epoch": 0.35356, + "grad_norm": 0.9216353722263187, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 35356 + }, + { + "epoch": 0.35357, + "grad_norm": 0.9309308206545308, + "learning_rate": 0.003, + "loss": 4.046, + "step": 35357 + }, + { + "epoch": 0.35358, + "grad_norm": 0.8854001884721865, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 35358 + }, + { + "epoch": 0.35359, + "grad_norm": 1.0028556364577885, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 35359 + }, + { + "epoch": 0.3536, + "grad_norm": 1.047990839566149, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 35360 + }, + { + "epoch": 0.35361, + "grad_norm": 0.9188657549870218, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 35361 + }, + { + "epoch": 0.35362, + "grad_norm": 0.8114004795879127, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 35362 + }, + { + "epoch": 0.35363, + "grad_norm": 0.7229253146182816, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 35363 + }, + { + "epoch": 0.35364, + "grad_norm": 0.6568712555270999, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 35364 + }, + { + "epoch": 0.35365, + "grad_norm": 0.6880630045396562, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 35365 + }, + { + "epoch": 0.35366, + "grad_norm": 0.5796398298358054, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 35366 + }, + { + "epoch": 0.35367, + "grad_norm": 0.5247208664326352, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 35367 + }, + { + "epoch": 0.35368, + "grad_norm": 0.5663349124674729, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 35368 + }, + { + "epoch": 0.35369, + "grad_norm": 0.5400870318266853, + "learning_rate": 0.003, + "loss": 3.985, + "step": 35369 + }, + { + "epoch": 0.3537, + "grad_norm": 0.5247435492319512, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 35370 + }, + { + "epoch": 0.35371, + "grad_norm": 0.5998273815954971, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 35371 + }, + { + "epoch": 0.35372, + "grad_norm": 0.6551432583646265, + "learning_rate": 0.003, + "loss": 4.051, + "step": 35372 + }, + { + "epoch": 0.35373, + "grad_norm": 0.7358287292269177, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 35373 + }, + { + "epoch": 0.35374, + "grad_norm": 0.8983625561306371, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 35374 + }, + { + "epoch": 0.35375, + "grad_norm": 1.0235256693710033, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 35375 + }, + { + "epoch": 0.35376, + "grad_norm": 1.1603737671672083, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 35376 + }, + { + "epoch": 0.35377, + "grad_norm": 0.8645561120081271, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 35377 + }, + { + "epoch": 0.35378, + "grad_norm": 0.8046297129629648, + "learning_rate": 0.003, + "loss": 4.038, + "step": 35378 + }, + { + "epoch": 0.35379, + "grad_norm": 0.8221691424420592, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 35379 + }, + { + "epoch": 0.3538, + "grad_norm": 0.7053458891488756, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 35380 + }, + { + "epoch": 0.35381, + "grad_norm": 0.7948976578657914, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 35381 + }, + { + "epoch": 0.35382, + "grad_norm": 0.7864013298484848, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 35382 + }, + { + "epoch": 0.35383, + "grad_norm": 0.850102623055183, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 35383 + }, + { + "epoch": 0.35384, + "grad_norm": 0.8913609993806474, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 35384 + }, + { + "epoch": 0.35385, + "grad_norm": 0.8348017433982378, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 35385 + }, + { + "epoch": 0.35386, + "grad_norm": 0.714294957069829, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 35386 + }, + { + "epoch": 0.35387, + "grad_norm": 0.7006362427376348, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 35387 + }, + { + "epoch": 0.35388, + "grad_norm": 0.8013106496227456, + "learning_rate": 0.003, + "loss": 3.9976, + "step": 35388 + }, + { + "epoch": 0.35389, + "grad_norm": 0.9018824605009831, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 35389 + }, + { + "epoch": 0.3539, + "grad_norm": 1.0607096500433855, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 35390 + }, + { + "epoch": 0.35391, + "grad_norm": 0.9712076058479823, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 35391 + }, + { + "epoch": 0.35392, + "grad_norm": 0.8850733634871047, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 35392 + }, + { + "epoch": 0.35393, + "grad_norm": 0.9064245592099144, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 35393 + }, + { + "epoch": 0.35394, + "grad_norm": 0.9242585077694726, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 35394 + }, + { + "epoch": 0.35395, + "grad_norm": 0.8876698410376317, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 35395 + }, + { + "epoch": 0.35396, + "grad_norm": 0.8660372536059948, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 35396 + }, + { + "epoch": 0.35397, + "grad_norm": 0.8415891691368266, + "learning_rate": 0.003, + "loss": 4.068, + "step": 35397 + }, + { + "epoch": 0.35398, + "grad_norm": 0.9176796834716632, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 35398 + }, + { + "epoch": 0.35399, + "grad_norm": 1.0698904466940191, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 35399 + }, + { + "epoch": 0.354, + "grad_norm": 1.0383445833283258, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 35400 + }, + { + "epoch": 0.35401, + "grad_norm": 1.1196866007761461, + "learning_rate": 0.003, + "loss": 4.086, + "step": 35401 + }, + { + "epoch": 0.35402, + "grad_norm": 0.8912817293772949, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 35402 + }, + { + "epoch": 0.35403, + "grad_norm": 0.7002773235008907, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 35403 + }, + { + "epoch": 0.35404, + "grad_norm": 0.7646154910017946, + "learning_rate": 0.003, + "loss": 4.062, + "step": 35404 + }, + { + "epoch": 0.35405, + "grad_norm": 0.9005732476823645, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 35405 + }, + { + "epoch": 0.35406, + "grad_norm": 1.2007459869668298, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 35406 + }, + { + "epoch": 0.35407, + "grad_norm": 1.0437292645267002, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 35407 + }, + { + "epoch": 0.35408, + "grad_norm": 0.9001446858574974, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 35408 + }, + { + "epoch": 0.35409, + "grad_norm": 0.7628853581995292, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 35409 + }, + { + "epoch": 0.3541, + "grad_norm": 0.7182388758999324, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 35410 + }, + { + "epoch": 0.35411, + "grad_norm": 0.6351385385217057, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 35411 + }, + { + "epoch": 0.35412, + "grad_norm": 0.6617658461659434, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 35412 + }, + { + "epoch": 0.35413, + "grad_norm": 0.7278165094375213, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 35413 + }, + { + "epoch": 0.35414, + "grad_norm": 0.8389160928755836, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 35414 + }, + { + "epoch": 0.35415, + "grad_norm": 0.9094257645524942, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 35415 + }, + { + "epoch": 0.35416, + "grad_norm": 0.9686318858392784, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 35416 + }, + { + "epoch": 0.35417, + "grad_norm": 1.061241103914107, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 35417 + }, + { + "epoch": 0.35418, + "grad_norm": 0.8761443973856404, + "learning_rate": 0.003, + "loss": 4.052, + "step": 35418 + }, + { + "epoch": 0.35419, + "grad_norm": 0.7023794223482236, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 35419 + }, + { + "epoch": 0.3542, + "grad_norm": 0.6642286276421315, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 35420 + }, + { + "epoch": 0.35421, + "grad_norm": 0.5739740805687852, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 35421 + }, + { + "epoch": 0.35422, + "grad_norm": 0.5675688254678937, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 35422 + }, + { + "epoch": 0.35423, + "grad_norm": 0.5488874376116479, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 35423 + }, + { + "epoch": 0.35424, + "grad_norm": 0.6186204671715866, + "learning_rate": 0.003, + "loss": 4.0013, + "step": 35424 + }, + { + "epoch": 0.35425, + "grad_norm": 0.6592067784840431, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 35425 + }, + { + "epoch": 0.35426, + "grad_norm": 0.7059352012527849, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 35426 + }, + { + "epoch": 0.35427, + "grad_norm": 0.7869286816287573, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 35427 + }, + { + "epoch": 0.35428, + "grad_norm": 0.9689694880785674, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 35428 + }, + { + "epoch": 0.35429, + "grad_norm": 0.9817857113715803, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 35429 + }, + { + "epoch": 0.3543, + "grad_norm": 0.7651993872981392, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 35430 + }, + { + "epoch": 0.35431, + "grad_norm": 0.7099705037031928, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 35431 + }, + { + "epoch": 0.35432, + "grad_norm": 0.7017381941545233, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 35432 + }, + { + "epoch": 0.35433, + "grad_norm": 0.581192715987244, + "learning_rate": 0.003, + "loss": 4.0088, + "step": 35433 + }, + { + "epoch": 0.35434, + "grad_norm": 0.6008515088269819, + "learning_rate": 0.003, + "loss": 4.004, + "step": 35434 + }, + { + "epoch": 0.35435, + "grad_norm": 0.7137652236081956, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 35435 + }, + { + "epoch": 0.35436, + "grad_norm": 0.8741253260856033, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 35436 + }, + { + "epoch": 0.35437, + "grad_norm": 0.9158389873368245, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 35437 + }, + { + "epoch": 0.35438, + "grad_norm": 0.9182323811968163, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 35438 + }, + { + "epoch": 0.35439, + "grad_norm": 0.9869966499017926, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 35439 + }, + { + "epoch": 0.3544, + "grad_norm": 1.124228761670888, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 35440 + }, + { + "epoch": 0.35441, + "grad_norm": 0.8920640898219491, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 35441 + }, + { + "epoch": 0.35442, + "grad_norm": 0.8502617671458848, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 35442 + }, + { + "epoch": 0.35443, + "grad_norm": 0.8404157469406964, + "learning_rate": 0.003, + "loss": 4.036, + "step": 35443 + }, + { + "epoch": 0.35444, + "grad_norm": 0.8460198528199361, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 35444 + }, + { + "epoch": 0.35445, + "grad_norm": 0.7754404782796849, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 35445 + }, + { + "epoch": 0.35446, + "grad_norm": 0.8554021724472471, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 35446 + }, + { + "epoch": 0.35447, + "grad_norm": 0.9123742963858626, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 35447 + }, + { + "epoch": 0.35448, + "grad_norm": 0.7759855308659531, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 35448 + }, + { + "epoch": 0.35449, + "grad_norm": 0.8195627593124858, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 35449 + }, + { + "epoch": 0.3545, + "grad_norm": 0.8370337435525346, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 35450 + }, + { + "epoch": 0.35451, + "grad_norm": 0.8780373491212887, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 35451 + }, + { + "epoch": 0.35452, + "grad_norm": 0.9662638733666206, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 35452 + }, + { + "epoch": 0.35453, + "grad_norm": 0.9551058221115036, + "learning_rate": 0.003, + "loss": 4.066, + "step": 35453 + }, + { + "epoch": 0.35454, + "grad_norm": 0.9329223391564759, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 35454 + }, + { + "epoch": 0.35455, + "grad_norm": 0.8095055469220586, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 35455 + }, + { + "epoch": 0.35456, + "grad_norm": 0.7525264242342562, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 35456 + }, + { + "epoch": 0.35457, + "grad_norm": 0.7446229595470634, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 35457 + }, + { + "epoch": 0.35458, + "grad_norm": 0.7606209693406574, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 35458 + }, + { + "epoch": 0.35459, + "grad_norm": 0.8992902480038396, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 35459 + }, + { + "epoch": 0.3546, + "grad_norm": 1.1703342294162382, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 35460 + }, + { + "epoch": 0.35461, + "grad_norm": 0.9816325070035651, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 35461 + }, + { + "epoch": 0.35462, + "grad_norm": 1.0079624051287999, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 35462 + }, + { + "epoch": 0.35463, + "grad_norm": 1.0551909022970964, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 35463 + }, + { + "epoch": 0.35464, + "grad_norm": 1.013437883306526, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 35464 + }, + { + "epoch": 0.35465, + "grad_norm": 0.9240021234396032, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 35465 + }, + { + "epoch": 0.35466, + "grad_norm": 0.7804810265110531, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 35466 + }, + { + "epoch": 0.35467, + "grad_norm": 0.6922883824077382, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 35467 + }, + { + "epoch": 0.35468, + "grad_norm": 0.7730069910482615, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 35468 + }, + { + "epoch": 0.35469, + "grad_norm": 0.8012463096266607, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 35469 + }, + { + "epoch": 0.3547, + "grad_norm": 0.9036135282892178, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 35470 + }, + { + "epoch": 0.35471, + "grad_norm": 1.0024960086696617, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 35471 + }, + { + "epoch": 0.35472, + "grad_norm": 1.246108237269644, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 35472 + }, + { + "epoch": 0.35473, + "grad_norm": 0.6871332843035695, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 35473 + }, + { + "epoch": 0.35474, + "grad_norm": 0.5429466243643678, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 35474 + }, + { + "epoch": 0.35475, + "grad_norm": 0.7004175000602282, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 35475 + }, + { + "epoch": 0.35476, + "grad_norm": 0.7588862515395214, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 35476 + }, + { + "epoch": 0.35477, + "grad_norm": 0.8013748202508281, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 35477 + }, + { + "epoch": 0.35478, + "grad_norm": 0.7686109613683162, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 35478 + }, + { + "epoch": 0.35479, + "grad_norm": 0.7889566830240237, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 35479 + }, + { + "epoch": 0.3548, + "grad_norm": 0.8221127693913399, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 35480 + }, + { + "epoch": 0.35481, + "grad_norm": 0.70090166983967, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 35481 + }, + { + "epoch": 0.35482, + "grad_norm": 0.6113639835146235, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 35482 + }, + { + "epoch": 0.35483, + "grad_norm": 0.5969069035492758, + "learning_rate": 0.003, + "loss": 4.019, + "step": 35483 + }, + { + "epoch": 0.35484, + "grad_norm": 0.6436367649700976, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 35484 + }, + { + "epoch": 0.35485, + "grad_norm": 0.6567738727394041, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 35485 + }, + { + "epoch": 0.35486, + "grad_norm": 0.6655943040058273, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 35486 + }, + { + "epoch": 0.35487, + "grad_norm": 0.5996525797539428, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 35487 + }, + { + "epoch": 0.35488, + "grad_norm": 0.5466117844547181, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 35488 + }, + { + "epoch": 0.35489, + "grad_norm": 0.5212695599076461, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 35489 + }, + { + "epoch": 0.3549, + "grad_norm": 0.5812236963965408, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 35490 + }, + { + "epoch": 0.35491, + "grad_norm": 0.5680105500200279, + "learning_rate": 0.003, + "loss": 3.9829, + "step": 35491 + }, + { + "epoch": 0.35492, + "grad_norm": 0.5819739732184216, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 35492 + }, + { + "epoch": 0.35493, + "grad_norm": 0.6366794945282233, + "learning_rate": 0.003, + "loss": 3.9968, + "step": 35493 + }, + { + "epoch": 0.35494, + "grad_norm": 0.8082036486306512, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 35494 + }, + { + "epoch": 0.35495, + "grad_norm": 1.0810733124370882, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 35495 + }, + { + "epoch": 0.35496, + "grad_norm": 1.1864534428628468, + "learning_rate": 0.003, + "loss": 3.9896, + "step": 35496 + }, + { + "epoch": 0.35497, + "grad_norm": 0.7539216179462753, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 35497 + }, + { + "epoch": 0.35498, + "grad_norm": 0.7157850564539804, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 35498 + }, + { + "epoch": 0.35499, + "grad_norm": 0.7906746239712356, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 35499 + }, + { + "epoch": 0.355, + "grad_norm": 0.8254569531211231, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 35500 + }, + { + "epoch": 0.35501, + "grad_norm": 0.8228127910814118, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 35501 + }, + { + "epoch": 0.35502, + "grad_norm": 0.8677643424650084, + "learning_rate": 0.003, + "loss": 4.025, + "step": 35502 + }, + { + "epoch": 0.35503, + "grad_norm": 0.9274158072269734, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 35503 + }, + { + "epoch": 0.35504, + "grad_norm": 1.0617704511569044, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 35504 + }, + { + "epoch": 0.35505, + "grad_norm": 0.9485630512402317, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 35505 + }, + { + "epoch": 0.35506, + "grad_norm": 0.8931346402986252, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 35506 + }, + { + "epoch": 0.35507, + "grad_norm": 0.8029627701583673, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 35507 + }, + { + "epoch": 0.35508, + "grad_norm": 0.8218367352625724, + "learning_rate": 0.003, + "loss": 4.029, + "step": 35508 + }, + { + "epoch": 0.35509, + "grad_norm": 0.7794129345318784, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 35509 + }, + { + "epoch": 0.3551, + "grad_norm": 0.8999416421444704, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 35510 + }, + { + "epoch": 0.35511, + "grad_norm": 1.1819951273386162, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 35511 + }, + { + "epoch": 0.35512, + "grad_norm": 1.1454617936174853, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 35512 + }, + { + "epoch": 0.35513, + "grad_norm": 0.8841410400493929, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 35513 + }, + { + "epoch": 0.35514, + "grad_norm": 0.9630639338382225, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 35514 + }, + { + "epoch": 0.35515, + "grad_norm": 1.0641706614949202, + "learning_rate": 0.003, + "loss": 4.055, + "step": 35515 + }, + { + "epoch": 0.35516, + "grad_norm": 0.8779166440122825, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 35516 + }, + { + "epoch": 0.35517, + "grad_norm": 0.8342835931354293, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 35517 + }, + { + "epoch": 0.35518, + "grad_norm": 0.7217692896391702, + "learning_rate": 0.003, + "loss": 4.047, + "step": 35518 + }, + { + "epoch": 0.35519, + "grad_norm": 0.8128729445412435, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 35519 + }, + { + "epoch": 0.3552, + "grad_norm": 0.8831018072071011, + "learning_rate": 0.003, + "loss": 4.036, + "step": 35520 + }, + { + "epoch": 0.35521, + "grad_norm": 1.0273283220439666, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 35521 + }, + { + "epoch": 0.35522, + "grad_norm": 1.0414509910057848, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 35522 + }, + { + "epoch": 0.35523, + "grad_norm": 1.0727947371051423, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 35523 + }, + { + "epoch": 0.35524, + "grad_norm": 0.7932233754886141, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 35524 + }, + { + "epoch": 0.35525, + "grad_norm": 0.769114435020755, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 35525 + }, + { + "epoch": 0.35526, + "grad_norm": 0.7468679725824244, + "learning_rate": 0.003, + "loss": 4.067, + "step": 35526 + }, + { + "epoch": 0.35527, + "grad_norm": 0.7833049177634545, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 35527 + }, + { + "epoch": 0.35528, + "grad_norm": 0.8355066727841511, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 35528 + }, + { + "epoch": 0.35529, + "grad_norm": 0.7579379380962419, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 35529 + }, + { + "epoch": 0.3553, + "grad_norm": 0.6645146848088115, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 35530 + }, + { + "epoch": 0.35531, + "grad_norm": 0.6167020852726559, + "learning_rate": 0.003, + "loss": 4.03, + "step": 35531 + }, + { + "epoch": 0.35532, + "grad_norm": 0.6518658342172913, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 35532 + }, + { + "epoch": 0.35533, + "grad_norm": 0.6875152442508491, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 35533 + }, + { + "epoch": 0.35534, + "grad_norm": 0.7086514968768225, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 35534 + }, + { + "epoch": 0.35535, + "grad_norm": 0.7669555376707017, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 35535 + }, + { + "epoch": 0.35536, + "grad_norm": 0.8515685693199104, + "learning_rate": 0.003, + "loss": 4.001, + "step": 35536 + }, + { + "epoch": 0.35537, + "grad_norm": 0.9856250130191145, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 35537 + }, + { + "epoch": 0.35538, + "grad_norm": 1.2720365034123322, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 35538 + }, + { + "epoch": 0.35539, + "grad_norm": 0.7709964096162728, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 35539 + }, + { + "epoch": 0.3554, + "grad_norm": 0.6117648240171862, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 35540 + }, + { + "epoch": 0.35541, + "grad_norm": 0.6265897285479147, + "learning_rate": 0.003, + "loss": 4.0039, + "step": 35541 + }, + { + "epoch": 0.35542, + "grad_norm": 0.7278098155207409, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 35542 + }, + { + "epoch": 0.35543, + "grad_norm": 0.8467449762739124, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 35543 + }, + { + "epoch": 0.35544, + "grad_norm": 0.9980062798865201, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 35544 + }, + { + "epoch": 0.35545, + "grad_norm": 1.0530627611853915, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 35545 + }, + { + "epoch": 0.35546, + "grad_norm": 1.0257438471950722, + "learning_rate": 0.003, + "loss": 4.041, + "step": 35546 + }, + { + "epoch": 0.35547, + "grad_norm": 1.177106759770191, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 35547 + }, + { + "epoch": 0.35548, + "grad_norm": 0.8847480801199755, + "learning_rate": 0.003, + "loss": 4.081, + "step": 35548 + }, + { + "epoch": 0.35549, + "grad_norm": 0.7156875765719646, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 35549 + }, + { + "epoch": 0.3555, + "grad_norm": 0.6587318538034197, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 35550 + }, + { + "epoch": 0.35551, + "grad_norm": 0.6761456268385696, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 35551 + }, + { + "epoch": 0.35552, + "grad_norm": 0.8097477941853418, + "learning_rate": 0.003, + "loss": 4.0111, + "step": 35552 + }, + { + "epoch": 0.35553, + "grad_norm": 0.9704542896312899, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 35553 + }, + { + "epoch": 0.35554, + "grad_norm": 0.9406978980735771, + "learning_rate": 0.003, + "loss": 4.031, + "step": 35554 + }, + { + "epoch": 0.35555, + "grad_norm": 0.8809474996016134, + "learning_rate": 0.003, + "loss": 4.034, + "step": 35555 + }, + { + "epoch": 0.35556, + "grad_norm": 0.7963865416393352, + "learning_rate": 0.003, + "loss": 4.0045, + "step": 35556 + }, + { + "epoch": 0.35557, + "grad_norm": 0.7242247163165031, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 35557 + }, + { + "epoch": 0.35558, + "grad_norm": 0.7328387720765082, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 35558 + }, + { + "epoch": 0.35559, + "grad_norm": 0.8243266882935657, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 35559 + }, + { + "epoch": 0.3556, + "grad_norm": 0.8723530599326395, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 35560 + }, + { + "epoch": 0.35561, + "grad_norm": 1.0566770957594827, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 35561 + }, + { + "epoch": 0.35562, + "grad_norm": 1.028254968726956, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 35562 + }, + { + "epoch": 0.35563, + "grad_norm": 0.9844385949474204, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 35563 + }, + { + "epoch": 0.35564, + "grad_norm": 0.9525308524027485, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 35564 + }, + { + "epoch": 0.35565, + "grad_norm": 0.8997534654437397, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 35565 + }, + { + "epoch": 0.35566, + "grad_norm": 0.8603629761680712, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 35566 + }, + { + "epoch": 0.35567, + "grad_norm": 0.8993786928019031, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 35567 + }, + { + "epoch": 0.35568, + "grad_norm": 0.9832693710122777, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 35568 + }, + { + "epoch": 0.35569, + "grad_norm": 0.8835962914608959, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 35569 + }, + { + "epoch": 0.3557, + "grad_norm": 0.8106325261137978, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 35570 + }, + { + "epoch": 0.35571, + "grad_norm": 0.7680980036144063, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 35571 + }, + { + "epoch": 0.35572, + "grad_norm": 0.8255545515300683, + "learning_rate": 0.003, + "loss": 4.027, + "step": 35572 + }, + { + "epoch": 0.35573, + "grad_norm": 0.8040403273344309, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 35573 + }, + { + "epoch": 0.35574, + "grad_norm": 0.8710634859590739, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 35574 + }, + { + "epoch": 0.35575, + "grad_norm": 0.8826007700576655, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 35575 + }, + { + "epoch": 0.35576, + "grad_norm": 0.8774583482108382, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 35576 + }, + { + "epoch": 0.35577, + "grad_norm": 0.8854128775302702, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 35577 + }, + { + "epoch": 0.35578, + "grad_norm": 0.9199942423913751, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 35578 + }, + { + "epoch": 0.35579, + "grad_norm": 0.9598962740676245, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 35579 + }, + { + "epoch": 0.3558, + "grad_norm": 1.0010455071077455, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 35580 + }, + { + "epoch": 0.35581, + "grad_norm": 0.9870978796097563, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 35581 + }, + { + "epoch": 0.35582, + "grad_norm": 0.9452054621743715, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 35582 + }, + { + "epoch": 0.35583, + "grad_norm": 0.7799836923257747, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 35583 + }, + { + "epoch": 0.35584, + "grad_norm": 0.6823097994412459, + "learning_rate": 0.003, + "loss": 4.037, + "step": 35584 + }, + { + "epoch": 0.35585, + "grad_norm": 0.6329547812475629, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 35585 + }, + { + "epoch": 0.35586, + "grad_norm": 0.6059535786109996, + "learning_rate": 0.003, + "loss": 3.994, + "step": 35586 + }, + { + "epoch": 0.35587, + "grad_norm": 0.6139006554217994, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 35587 + }, + { + "epoch": 0.35588, + "grad_norm": 0.5892930525779317, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 35588 + }, + { + "epoch": 0.35589, + "grad_norm": 0.6463888956744265, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 35589 + }, + { + "epoch": 0.3559, + "grad_norm": 0.7327634667561984, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 35590 + }, + { + "epoch": 0.35591, + "grad_norm": 0.8906705874850709, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 35591 + }, + { + "epoch": 0.35592, + "grad_norm": 1.0079326815533545, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 35592 + }, + { + "epoch": 0.35593, + "grad_norm": 0.9996379490059354, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 35593 + }, + { + "epoch": 0.35594, + "grad_norm": 0.8104297155238523, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 35594 + }, + { + "epoch": 0.35595, + "grad_norm": 0.7245955273114536, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 35595 + }, + { + "epoch": 0.35596, + "grad_norm": 0.8071081999913456, + "learning_rate": 0.003, + "loss": 4.011, + "step": 35596 + }, + { + "epoch": 0.35597, + "grad_norm": 0.8264448881034141, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 35597 + }, + { + "epoch": 0.35598, + "grad_norm": 0.8959183103735588, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 35598 + }, + { + "epoch": 0.35599, + "grad_norm": 0.8856612560928192, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 35599 + }, + { + "epoch": 0.356, + "grad_norm": 0.8213477277748193, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 35600 + }, + { + "epoch": 0.35601, + "grad_norm": 0.6854831352161981, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 35601 + }, + { + "epoch": 0.35602, + "grad_norm": 0.6734393723793403, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 35602 + }, + { + "epoch": 0.35603, + "grad_norm": 0.8434871330075129, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 35603 + }, + { + "epoch": 0.35604, + "grad_norm": 1.0066102226087577, + "learning_rate": 0.003, + "loss": 4.065, + "step": 35604 + }, + { + "epoch": 0.35605, + "grad_norm": 1.024795617466296, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 35605 + }, + { + "epoch": 0.35606, + "grad_norm": 0.8527908522013521, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 35606 + }, + { + "epoch": 0.35607, + "grad_norm": 0.776680642092252, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 35607 + }, + { + "epoch": 0.35608, + "grad_norm": 0.8076314946283357, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 35608 + }, + { + "epoch": 0.35609, + "grad_norm": 0.8409965670546519, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 35609 + }, + { + "epoch": 0.3561, + "grad_norm": 0.751464434316504, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 35610 + }, + { + "epoch": 0.35611, + "grad_norm": 0.6950080605499266, + "learning_rate": 0.003, + "loss": 4.034, + "step": 35611 + }, + { + "epoch": 0.35612, + "grad_norm": 0.6465073485550279, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 35612 + }, + { + "epoch": 0.35613, + "grad_norm": 0.7199595671038781, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 35613 + }, + { + "epoch": 0.35614, + "grad_norm": 0.7937611986605759, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 35614 + }, + { + "epoch": 0.35615, + "grad_norm": 0.8366000998606218, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 35615 + }, + { + "epoch": 0.35616, + "grad_norm": 0.8172045351919865, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 35616 + }, + { + "epoch": 0.35617, + "grad_norm": 0.89973338525607, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 35617 + }, + { + "epoch": 0.35618, + "grad_norm": 0.8788400233400065, + "learning_rate": 0.003, + "loss": 4.043, + "step": 35618 + }, + { + "epoch": 0.35619, + "grad_norm": 0.8674971785907195, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 35619 + }, + { + "epoch": 0.3562, + "grad_norm": 0.8506353834252806, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 35620 + }, + { + "epoch": 0.35621, + "grad_norm": 0.9242726208705895, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 35621 + }, + { + "epoch": 0.35622, + "grad_norm": 0.8950835402909704, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 35622 + }, + { + "epoch": 0.35623, + "grad_norm": 0.7404139085725188, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 35623 + }, + { + "epoch": 0.35624, + "grad_norm": 0.7900457853163831, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 35624 + }, + { + "epoch": 0.35625, + "grad_norm": 0.9475956628830594, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 35625 + }, + { + "epoch": 0.35626, + "grad_norm": 1.2848803834395723, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 35626 + }, + { + "epoch": 0.35627, + "grad_norm": 0.9793321975608914, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 35627 + }, + { + "epoch": 0.35628, + "grad_norm": 0.9363119161820275, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 35628 + }, + { + "epoch": 0.35629, + "grad_norm": 0.8710769346246292, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 35629 + }, + { + "epoch": 0.3563, + "grad_norm": 0.8535843830255943, + "learning_rate": 0.003, + "loss": 4.0066, + "step": 35630 + }, + { + "epoch": 0.35631, + "grad_norm": 0.6993579227361221, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 35631 + }, + { + "epoch": 0.35632, + "grad_norm": 0.6895740148125786, + "learning_rate": 0.003, + "loss": 3.9861, + "step": 35632 + }, + { + "epoch": 0.35633, + "grad_norm": 0.7347109266107757, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 35633 + }, + { + "epoch": 0.35634, + "grad_norm": 0.8182167986920449, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 35634 + }, + { + "epoch": 0.35635, + "grad_norm": 0.8952372440982915, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 35635 + }, + { + "epoch": 0.35636, + "grad_norm": 1.1045496111115596, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 35636 + }, + { + "epoch": 0.35637, + "grad_norm": 1.0206717318220273, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 35637 + }, + { + "epoch": 0.35638, + "grad_norm": 1.117655261302622, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 35638 + }, + { + "epoch": 0.35639, + "grad_norm": 0.8060628125253311, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 35639 + }, + { + "epoch": 0.3564, + "grad_norm": 0.6668476328683659, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 35640 + }, + { + "epoch": 0.35641, + "grad_norm": 0.5854658827694242, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 35641 + }, + { + "epoch": 0.35642, + "grad_norm": 0.5720785686479769, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 35642 + }, + { + "epoch": 0.35643, + "grad_norm": 0.6368651612920068, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 35643 + }, + { + "epoch": 0.35644, + "grad_norm": 0.6709454309389514, + "learning_rate": 0.003, + "loss": 4.038, + "step": 35644 + }, + { + "epoch": 0.35645, + "grad_norm": 0.7428107112251099, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 35645 + }, + { + "epoch": 0.35646, + "grad_norm": 0.8977368645919559, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 35646 + }, + { + "epoch": 0.35647, + "grad_norm": 1.0964348545991895, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 35647 + }, + { + "epoch": 0.35648, + "grad_norm": 1.0898039064099752, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 35648 + }, + { + "epoch": 0.35649, + "grad_norm": 0.868695775301868, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 35649 + }, + { + "epoch": 0.3565, + "grad_norm": 0.6685591742752819, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 35650 + }, + { + "epoch": 0.35651, + "grad_norm": 0.6075220108469472, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 35651 + }, + { + "epoch": 0.35652, + "grad_norm": 0.5597705629615594, + "learning_rate": 0.003, + "loss": 4.054, + "step": 35652 + }, + { + "epoch": 0.35653, + "grad_norm": 0.594995733462146, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 35653 + }, + { + "epoch": 0.35654, + "grad_norm": 0.5772478980983075, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 35654 + }, + { + "epoch": 0.35655, + "grad_norm": 0.5981366146628451, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 35655 + }, + { + "epoch": 0.35656, + "grad_norm": 0.5781208845240513, + "learning_rate": 0.003, + "loss": 3.9879, + "step": 35656 + }, + { + "epoch": 0.35657, + "grad_norm": 0.5292548917462926, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 35657 + }, + { + "epoch": 0.35658, + "grad_norm": 0.5055722217035394, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 35658 + }, + { + "epoch": 0.35659, + "grad_norm": 0.5213369006341231, + "learning_rate": 0.003, + "loss": 3.9975, + "step": 35659 + }, + { + "epoch": 0.3566, + "grad_norm": 0.6134609430403047, + "learning_rate": 0.003, + "loss": 3.9995, + "step": 35660 + }, + { + "epoch": 0.35661, + "grad_norm": 0.6720311495034645, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 35661 + }, + { + "epoch": 0.35662, + "grad_norm": 0.6616594136370846, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 35662 + }, + { + "epoch": 0.35663, + "grad_norm": 0.6545444946597759, + "learning_rate": 0.003, + "loss": 3.9938, + "step": 35663 + }, + { + "epoch": 0.35664, + "grad_norm": 0.6896095792991492, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 35664 + }, + { + "epoch": 0.35665, + "grad_norm": 0.7270167048894088, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 35665 + }, + { + "epoch": 0.35666, + "grad_norm": 0.8708316391014879, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 35666 + }, + { + "epoch": 0.35667, + "grad_norm": 1.202704569954379, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 35667 + }, + { + "epoch": 0.35668, + "grad_norm": 1.070754231139595, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 35668 + }, + { + "epoch": 0.35669, + "grad_norm": 0.8868673610262438, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 35669 + }, + { + "epoch": 0.3567, + "grad_norm": 0.8580689435200328, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 35670 + }, + { + "epoch": 0.35671, + "grad_norm": 0.9056492583376805, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 35671 + }, + { + "epoch": 0.35672, + "grad_norm": 1.0695903535691391, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 35672 + }, + { + "epoch": 0.35673, + "grad_norm": 1.1758185934747443, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 35673 + }, + { + "epoch": 0.35674, + "grad_norm": 0.838267299740318, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 35674 + }, + { + "epoch": 0.35675, + "grad_norm": 0.9059065237059448, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 35675 + }, + { + "epoch": 0.35676, + "grad_norm": 0.8767640996053887, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 35676 + }, + { + "epoch": 0.35677, + "grad_norm": 0.9001532769348776, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 35677 + }, + { + "epoch": 0.35678, + "grad_norm": 1.0044900987688095, + "learning_rate": 0.003, + "loss": 4.073, + "step": 35678 + }, + { + "epoch": 0.35679, + "grad_norm": 1.0595302639842004, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 35679 + }, + { + "epoch": 0.3568, + "grad_norm": 1.0554006121817447, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 35680 + }, + { + "epoch": 0.35681, + "grad_norm": 1.0359540300818586, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 35681 + }, + { + "epoch": 0.35682, + "grad_norm": 1.0496549415094933, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 35682 + }, + { + "epoch": 0.35683, + "grad_norm": 1.1193186233760666, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 35683 + }, + { + "epoch": 0.35684, + "grad_norm": 1.0289417562870988, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 35684 + }, + { + "epoch": 0.35685, + "grad_norm": 0.9443888636827353, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 35685 + }, + { + "epoch": 0.35686, + "grad_norm": 0.8832169588394676, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 35686 + }, + { + "epoch": 0.35687, + "grad_norm": 0.7441475426570914, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 35687 + }, + { + "epoch": 0.35688, + "grad_norm": 0.7371616302804537, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 35688 + }, + { + "epoch": 0.35689, + "grad_norm": 0.7140159926015877, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 35689 + }, + { + "epoch": 0.3569, + "grad_norm": 0.6883988835206376, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 35690 + }, + { + "epoch": 0.35691, + "grad_norm": 0.7476805459953952, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 35691 + }, + { + "epoch": 0.35692, + "grad_norm": 0.7932129633222358, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 35692 + }, + { + "epoch": 0.35693, + "grad_norm": 0.884939421216784, + "learning_rate": 0.003, + "loss": 4.046, + "step": 35693 + }, + { + "epoch": 0.35694, + "grad_norm": 0.9201676174588778, + "learning_rate": 0.003, + "loss": 4.0089, + "step": 35694 + }, + { + "epoch": 0.35695, + "grad_norm": 0.796262460485084, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 35695 + }, + { + "epoch": 0.35696, + "grad_norm": 0.6440851813124607, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 35696 + }, + { + "epoch": 0.35697, + "grad_norm": 0.5752703503067458, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 35697 + }, + { + "epoch": 0.35698, + "grad_norm": 0.6587468210284368, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 35698 + }, + { + "epoch": 0.35699, + "grad_norm": 0.7020238763218456, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 35699 + }, + { + "epoch": 0.357, + "grad_norm": 0.640345949998875, + "learning_rate": 0.003, + "loss": 4.025, + "step": 35700 + }, + { + "epoch": 0.35701, + "grad_norm": 0.6602586334705417, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 35701 + }, + { + "epoch": 0.35702, + "grad_norm": 0.7696306080726952, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 35702 + }, + { + "epoch": 0.35703, + "grad_norm": 0.7729789988296374, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 35703 + }, + { + "epoch": 0.35704, + "grad_norm": 0.7179321852284546, + "learning_rate": 0.003, + "loss": 4.023, + "step": 35704 + }, + { + "epoch": 0.35705, + "grad_norm": 0.8787784929494487, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 35705 + }, + { + "epoch": 0.35706, + "grad_norm": 1.1224697717720735, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 35706 + }, + { + "epoch": 0.35707, + "grad_norm": 0.8999590573780386, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 35707 + }, + { + "epoch": 0.35708, + "grad_norm": 0.7419019613569772, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 35708 + }, + { + "epoch": 0.35709, + "grad_norm": 0.8306810437530958, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 35709 + }, + { + "epoch": 0.3571, + "grad_norm": 0.8955408833203646, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 35710 + }, + { + "epoch": 0.35711, + "grad_norm": 0.9928348083128814, + "learning_rate": 0.003, + "loss": 4.032, + "step": 35711 + }, + { + "epoch": 0.35712, + "grad_norm": 1.1378248510819395, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 35712 + }, + { + "epoch": 0.35713, + "grad_norm": 0.9160755596437987, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 35713 + }, + { + "epoch": 0.35714, + "grad_norm": 0.8228665434239927, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 35714 + }, + { + "epoch": 0.35715, + "grad_norm": 0.8056043565290397, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 35715 + }, + { + "epoch": 0.35716, + "grad_norm": 0.7320479300225768, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 35716 + }, + { + "epoch": 0.35717, + "grad_norm": 0.7923578963997363, + "learning_rate": 0.003, + "loss": 4.013, + "step": 35717 + }, + { + "epoch": 0.35718, + "grad_norm": 0.8158172273819033, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 35718 + }, + { + "epoch": 0.35719, + "grad_norm": 0.8302044476275666, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 35719 + }, + { + "epoch": 0.3572, + "grad_norm": 0.887616083926048, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 35720 + }, + { + "epoch": 0.35721, + "grad_norm": 0.9427311440966861, + "learning_rate": 0.003, + "loss": 4.026, + "step": 35721 + }, + { + "epoch": 0.35722, + "grad_norm": 0.9337743861315516, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 35722 + }, + { + "epoch": 0.35723, + "grad_norm": 0.9599231272601718, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 35723 + }, + { + "epoch": 0.35724, + "grad_norm": 0.8962227981402232, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 35724 + }, + { + "epoch": 0.35725, + "grad_norm": 0.8272482967510436, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 35725 + }, + { + "epoch": 0.35726, + "grad_norm": 0.8710650912849195, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 35726 + }, + { + "epoch": 0.35727, + "grad_norm": 0.8441960372054734, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 35727 + }, + { + "epoch": 0.35728, + "grad_norm": 0.7243374222418736, + "learning_rate": 0.003, + "loss": 4.036, + "step": 35728 + }, + { + "epoch": 0.35729, + "grad_norm": 0.7536326928214573, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 35729 + }, + { + "epoch": 0.3573, + "grad_norm": 0.9254168923490416, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 35730 + }, + { + "epoch": 0.35731, + "grad_norm": 1.1840355043884516, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 35731 + }, + { + "epoch": 0.35732, + "grad_norm": 0.9781255563794017, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 35732 + }, + { + "epoch": 0.35733, + "grad_norm": 1.0797901642510934, + "learning_rate": 0.003, + "loss": 4.072, + "step": 35733 + }, + { + "epoch": 0.35734, + "grad_norm": 0.8965002076531271, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 35734 + }, + { + "epoch": 0.35735, + "grad_norm": 0.8253981263162796, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 35735 + }, + { + "epoch": 0.35736, + "grad_norm": 0.8797476365662505, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 35736 + }, + { + "epoch": 0.35737, + "grad_norm": 0.8168856016840561, + "learning_rate": 0.003, + "loss": 4.038, + "step": 35737 + }, + { + "epoch": 0.35738, + "grad_norm": 0.8269705070885212, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 35738 + }, + { + "epoch": 0.35739, + "grad_norm": 0.6958354973779299, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 35739 + }, + { + "epoch": 0.3574, + "grad_norm": 0.6947639916319921, + "learning_rate": 0.003, + "loss": 4.033, + "step": 35740 + }, + { + "epoch": 0.35741, + "grad_norm": 0.7655183508904077, + "learning_rate": 0.003, + "loss": 4.013, + "step": 35741 + }, + { + "epoch": 0.35742, + "grad_norm": 1.0051490067783835, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 35742 + }, + { + "epoch": 0.35743, + "grad_norm": 1.2664906631774058, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 35743 + }, + { + "epoch": 0.35744, + "grad_norm": 0.6912057699718547, + "learning_rate": 0.003, + "loss": 4.035, + "step": 35744 + }, + { + "epoch": 0.35745, + "grad_norm": 0.6595572185855328, + "learning_rate": 0.003, + "loss": 3.9995, + "step": 35745 + }, + { + "epoch": 0.35746, + "grad_norm": 0.7058174846101571, + "learning_rate": 0.003, + "loss": 4.0011, + "step": 35746 + }, + { + "epoch": 0.35747, + "grad_norm": 0.8339045332934717, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 35747 + }, + { + "epoch": 0.35748, + "grad_norm": 0.9718697434974324, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 35748 + }, + { + "epoch": 0.35749, + "grad_norm": 0.9313153354131446, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 35749 + }, + { + "epoch": 0.3575, + "grad_norm": 0.8200877491281663, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 35750 + }, + { + "epoch": 0.35751, + "grad_norm": 0.9066927403661558, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 35751 + }, + { + "epoch": 0.35752, + "grad_norm": 0.8038414210303122, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 35752 + }, + { + "epoch": 0.35753, + "grad_norm": 0.5934362619105293, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 35753 + }, + { + "epoch": 0.35754, + "grad_norm": 0.6411011935286931, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 35754 + }, + { + "epoch": 0.35755, + "grad_norm": 0.6996753745207122, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 35755 + }, + { + "epoch": 0.35756, + "grad_norm": 0.7255755058558641, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 35756 + }, + { + "epoch": 0.35757, + "grad_norm": 0.7485450940230268, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 35757 + }, + { + "epoch": 0.35758, + "grad_norm": 0.746790779413974, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 35758 + }, + { + "epoch": 0.35759, + "grad_norm": 0.6877561301482958, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 35759 + }, + { + "epoch": 0.3576, + "grad_norm": 0.6074135828916071, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 35760 + }, + { + "epoch": 0.35761, + "grad_norm": 0.6358759129297189, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 35761 + }, + { + "epoch": 0.35762, + "grad_norm": 0.7858513419540549, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 35762 + }, + { + "epoch": 0.35763, + "grad_norm": 1.1489242435958604, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 35763 + }, + { + "epoch": 0.35764, + "grad_norm": 0.9451925504323201, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 35764 + }, + { + "epoch": 0.35765, + "grad_norm": 0.7370724999113019, + "learning_rate": 0.003, + "loss": 4.003, + "step": 35765 + }, + { + "epoch": 0.35766, + "grad_norm": 0.6083271594004961, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 35766 + }, + { + "epoch": 0.35767, + "grad_norm": 0.6358724150449878, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 35767 + }, + { + "epoch": 0.35768, + "grad_norm": 0.7113769266019271, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 35768 + }, + { + "epoch": 0.35769, + "grad_norm": 0.731297909806317, + "learning_rate": 0.003, + "loss": 3.9976, + "step": 35769 + }, + { + "epoch": 0.3577, + "grad_norm": 0.7436818150107795, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 35770 + }, + { + "epoch": 0.35771, + "grad_norm": 0.7989994563124555, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 35771 + }, + { + "epoch": 0.35772, + "grad_norm": 0.8562412550087297, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 35772 + }, + { + "epoch": 0.35773, + "grad_norm": 0.8768679913407814, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 35773 + }, + { + "epoch": 0.35774, + "grad_norm": 0.9254009323210972, + "learning_rate": 0.003, + "loss": 4.0035, + "step": 35774 + }, + { + "epoch": 0.35775, + "grad_norm": 0.9077319144017103, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 35775 + }, + { + "epoch": 0.35776, + "grad_norm": 0.7643280899222715, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 35776 + }, + { + "epoch": 0.35777, + "grad_norm": 0.8753310524115449, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 35777 + }, + { + "epoch": 0.35778, + "grad_norm": 0.9503089406596378, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 35778 + }, + { + "epoch": 0.35779, + "grad_norm": 0.989208203643738, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 35779 + }, + { + "epoch": 0.3578, + "grad_norm": 0.9226713704629453, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 35780 + }, + { + "epoch": 0.35781, + "grad_norm": 0.9476995824285103, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 35781 + }, + { + "epoch": 0.35782, + "grad_norm": 0.9506169621962408, + "learning_rate": 0.003, + "loss": 4.045, + "step": 35782 + }, + { + "epoch": 0.35783, + "grad_norm": 1.0956381327555391, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 35783 + }, + { + "epoch": 0.35784, + "grad_norm": 1.2110592107460763, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 35784 + }, + { + "epoch": 0.35785, + "grad_norm": 0.8052704752057784, + "learning_rate": 0.003, + "loss": 4.019, + "step": 35785 + }, + { + "epoch": 0.35786, + "grad_norm": 0.6404169853516094, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 35786 + }, + { + "epoch": 0.35787, + "grad_norm": 0.7146104427398803, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 35787 + }, + { + "epoch": 0.35788, + "grad_norm": 0.8991411899150431, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 35788 + }, + { + "epoch": 0.35789, + "grad_norm": 1.115339881941168, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 35789 + }, + { + "epoch": 0.3579, + "grad_norm": 1.104152317999324, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 35790 + }, + { + "epoch": 0.35791, + "grad_norm": 0.8595676893025892, + "learning_rate": 0.003, + "loss": 3.9982, + "step": 35791 + }, + { + "epoch": 0.35792, + "grad_norm": 0.8123888232614761, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 35792 + }, + { + "epoch": 0.35793, + "grad_norm": 0.7562181224453615, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 35793 + }, + { + "epoch": 0.35794, + "grad_norm": 0.767511110378619, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 35794 + }, + { + "epoch": 0.35795, + "grad_norm": 0.840848489941889, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 35795 + }, + { + "epoch": 0.35796, + "grad_norm": 0.922914159958431, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 35796 + }, + { + "epoch": 0.35797, + "grad_norm": 1.0846332374797876, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 35797 + }, + { + "epoch": 0.35798, + "grad_norm": 0.8021538767525749, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 35798 + }, + { + "epoch": 0.35799, + "grad_norm": 0.7323737307816428, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 35799 + }, + { + "epoch": 0.358, + "grad_norm": 0.7311887990612411, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 35800 + }, + { + "epoch": 0.35801, + "grad_norm": 0.5727139715453451, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 35801 + }, + { + "epoch": 0.35802, + "grad_norm": 0.5897442276136924, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 35802 + }, + { + "epoch": 0.35803, + "grad_norm": 0.6983796947891424, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 35803 + }, + { + "epoch": 0.35804, + "grad_norm": 0.7837855446886233, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 35804 + }, + { + "epoch": 0.35805, + "grad_norm": 0.8682041559614913, + "learning_rate": 0.003, + "loss": 4.057, + "step": 35805 + }, + { + "epoch": 0.35806, + "grad_norm": 0.969551002817275, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 35806 + }, + { + "epoch": 0.35807, + "grad_norm": 1.0287386012304156, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 35807 + }, + { + "epoch": 0.35808, + "grad_norm": 0.8580712247624265, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 35808 + }, + { + "epoch": 0.35809, + "grad_norm": 0.7544976513189149, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 35809 + }, + { + "epoch": 0.3581, + "grad_norm": 0.6977270295680401, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 35810 + }, + { + "epoch": 0.35811, + "grad_norm": 0.8201142515078498, + "learning_rate": 0.003, + "loss": 4.053, + "step": 35811 + }, + { + "epoch": 0.35812, + "grad_norm": 0.8830025438106857, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 35812 + }, + { + "epoch": 0.35813, + "grad_norm": 0.9584571491655394, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 35813 + }, + { + "epoch": 0.35814, + "grad_norm": 0.9000776794277944, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 35814 + }, + { + "epoch": 0.35815, + "grad_norm": 0.7658226606278749, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 35815 + }, + { + "epoch": 0.35816, + "grad_norm": 0.8733202301812838, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 35816 + }, + { + "epoch": 0.35817, + "grad_norm": 0.9466515496795979, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 35817 + }, + { + "epoch": 0.35818, + "grad_norm": 0.8312895394016373, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 35818 + }, + { + "epoch": 0.35819, + "grad_norm": 0.8807284835195941, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 35819 + }, + { + "epoch": 0.3582, + "grad_norm": 0.9770344746294646, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 35820 + }, + { + "epoch": 0.35821, + "grad_norm": 1.2742784317916769, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 35821 + }, + { + "epoch": 0.35822, + "grad_norm": 0.9587846759605987, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 35822 + }, + { + "epoch": 0.35823, + "grad_norm": 1.006576833377721, + "learning_rate": 0.003, + "loss": 4.06, + "step": 35823 + }, + { + "epoch": 0.35824, + "grad_norm": 1.028722691839176, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 35824 + }, + { + "epoch": 0.35825, + "grad_norm": 1.0085291659514588, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 35825 + }, + { + "epoch": 0.35826, + "grad_norm": 0.8882942999500639, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 35826 + }, + { + "epoch": 0.35827, + "grad_norm": 0.9628206649503425, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 35827 + }, + { + "epoch": 0.35828, + "grad_norm": 0.9624857453553891, + "learning_rate": 0.003, + "loss": 4.071, + "step": 35828 + }, + { + "epoch": 0.35829, + "grad_norm": 0.9477547563462154, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 35829 + }, + { + "epoch": 0.3583, + "grad_norm": 1.0562872739994562, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 35830 + }, + { + "epoch": 0.35831, + "grad_norm": 0.9062565818936928, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 35831 + }, + { + "epoch": 0.35832, + "grad_norm": 0.8462554693802367, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 35832 + }, + { + "epoch": 0.35833, + "grad_norm": 0.7993484928513301, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 35833 + }, + { + "epoch": 0.35834, + "grad_norm": 0.787445764699777, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 35834 + }, + { + "epoch": 0.35835, + "grad_norm": 0.8014684316455056, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 35835 + }, + { + "epoch": 0.35836, + "grad_norm": 0.8925262608560436, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 35836 + }, + { + "epoch": 0.35837, + "grad_norm": 0.9491587356609208, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 35837 + }, + { + "epoch": 0.35838, + "grad_norm": 0.8048143487825181, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 35838 + }, + { + "epoch": 0.35839, + "grad_norm": 0.6215864907299954, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 35839 + }, + { + "epoch": 0.3584, + "grad_norm": 0.5918753385135883, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 35840 + }, + { + "epoch": 0.35841, + "grad_norm": 0.5635948495219978, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 35841 + }, + { + "epoch": 0.35842, + "grad_norm": 0.53481792365274, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 35842 + }, + { + "epoch": 0.35843, + "grad_norm": 0.6311979982127771, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 35843 + }, + { + "epoch": 0.35844, + "grad_norm": 0.7606152889723282, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 35844 + }, + { + "epoch": 0.35845, + "grad_norm": 0.9191801092473408, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 35845 + }, + { + "epoch": 0.35846, + "grad_norm": 1.126926836154685, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 35846 + }, + { + "epoch": 0.35847, + "grad_norm": 0.9016179542828027, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 35847 + }, + { + "epoch": 0.35848, + "grad_norm": 0.7456965502743051, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 35848 + }, + { + "epoch": 0.35849, + "grad_norm": 0.6055919165224919, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 35849 + }, + { + "epoch": 0.3585, + "grad_norm": 0.5926808223111274, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 35850 + }, + { + "epoch": 0.35851, + "grad_norm": 0.6435694033811742, + "learning_rate": 0.003, + "loss": 4.0066, + "step": 35851 + }, + { + "epoch": 0.35852, + "grad_norm": 0.6336047419389741, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 35852 + }, + { + "epoch": 0.35853, + "grad_norm": 0.6080603726646344, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 35853 + }, + { + "epoch": 0.35854, + "grad_norm": 0.5773159298418195, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 35854 + }, + { + "epoch": 0.35855, + "grad_norm": 0.5894811806845447, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 35855 + }, + { + "epoch": 0.35856, + "grad_norm": 0.5769730868359955, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 35856 + }, + { + "epoch": 0.35857, + "grad_norm": 0.697784314023132, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 35857 + }, + { + "epoch": 0.35858, + "grad_norm": 0.9553486091938277, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 35858 + }, + { + "epoch": 0.35859, + "grad_norm": 1.2720336755106505, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 35859 + }, + { + "epoch": 0.3586, + "grad_norm": 0.6882596422330394, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 35860 + }, + { + "epoch": 0.35861, + "grad_norm": 0.6588307229709995, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 35861 + }, + { + "epoch": 0.35862, + "grad_norm": 0.736417974897671, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 35862 + }, + { + "epoch": 0.35863, + "grad_norm": 0.7649227408945096, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 35863 + }, + { + "epoch": 0.35864, + "grad_norm": 0.760987825956093, + "learning_rate": 0.003, + "loss": 4.0028, + "step": 35864 + }, + { + "epoch": 0.35865, + "grad_norm": 0.7540592879390556, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 35865 + }, + { + "epoch": 0.35866, + "grad_norm": 0.8059999608284294, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 35866 + }, + { + "epoch": 0.35867, + "grad_norm": 0.909376605840871, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 35867 + }, + { + "epoch": 0.35868, + "grad_norm": 1.0702820025786035, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 35868 + }, + { + "epoch": 0.35869, + "grad_norm": 1.0668750690656204, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 35869 + }, + { + "epoch": 0.3587, + "grad_norm": 0.9771639813335892, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 35870 + }, + { + "epoch": 0.35871, + "grad_norm": 0.9388381545406745, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 35871 + }, + { + "epoch": 0.35872, + "grad_norm": 0.9316609735920678, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 35872 + }, + { + "epoch": 0.35873, + "grad_norm": 0.8275491269716966, + "learning_rate": 0.003, + "loss": 4.033, + "step": 35873 + }, + { + "epoch": 0.35874, + "grad_norm": 0.8904251615424199, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 35874 + }, + { + "epoch": 0.35875, + "grad_norm": 0.8311514105160075, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 35875 + }, + { + "epoch": 0.35876, + "grad_norm": 0.8233615428886373, + "learning_rate": 0.003, + "loss": 4.0072, + "step": 35876 + }, + { + "epoch": 0.35877, + "grad_norm": 0.8802855925778794, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 35877 + }, + { + "epoch": 0.35878, + "grad_norm": 0.9232019989122032, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 35878 + }, + { + "epoch": 0.35879, + "grad_norm": 0.9638917866888298, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 35879 + }, + { + "epoch": 0.3588, + "grad_norm": 1.0411728515588798, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 35880 + }, + { + "epoch": 0.35881, + "grad_norm": 0.7350074992996689, + "learning_rate": 0.003, + "loss": 4.014, + "step": 35881 + }, + { + "epoch": 0.35882, + "grad_norm": 0.6678018743584874, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 35882 + }, + { + "epoch": 0.35883, + "grad_norm": 0.6688161464951423, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 35883 + }, + { + "epoch": 0.35884, + "grad_norm": 0.6875551857424248, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 35884 + }, + { + "epoch": 0.35885, + "grad_norm": 0.664680453722645, + "learning_rate": 0.003, + "loss": 4.051, + "step": 35885 + }, + { + "epoch": 0.35886, + "grad_norm": 0.7533618505561995, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 35886 + }, + { + "epoch": 0.35887, + "grad_norm": 0.8581671096389137, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 35887 + }, + { + "epoch": 0.35888, + "grad_norm": 0.9396249143604423, + "learning_rate": 0.003, + "loss": 4.056, + "step": 35888 + }, + { + "epoch": 0.35889, + "grad_norm": 1.0283851510310789, + "learning_rate": 0.003, + "loss": 4.038, + "step": 35889 + }, + { + "epoch": 0.3589, + "grad_norm": 1.1040121933510236, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 35890 + }, + { + "epoch": 0.35891, + "grad_norm": 0.8564892923471411, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 35891 + }, + { + "epoch": 0.35892, + "grad_norm": 0.8420840487431895, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 35892 + }, + { + "epoch": 0.35893, + "grad_norm": 0.954698804571236, + "learning_rate": 0.003, + "loss": 4.0072, + "step": 35893 + }, + { + "epoch": 0.35894, + "grad_norm": 1.0649186116898905, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 35894 + }, + { + "epoch": 0.35895, + "grad_norm": 1.006670972825318, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 35895 + }, + { + "epoch": 0.35896, + "grad_norm": 0.9026554313297486, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 35896 + }, + { + "epoch": 0.35897, + "grad_norm": 0.8334738032550582, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 35897 + }, + { + "epoch": 0.35898, + "grad_norm": 0.9443334749453474, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 35898 + }, + { + "epoch": 0.35899, + "grad_norm": 1.1087248889912282, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 35899 + }, + { + "epoch": 0.359, + "grad_norm": 1.0176737131204736, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 35900 + }, + { + "epoch": 0.35901, + "grad_norm": 0.925975943028702, + "learning_rate": 0.003, + "loss": 4.047, + "step": 35901 + }, + { + "epoch": 0.35902, + "grad_norm": 0.9553335022742603, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 35902 + }, + { + "epoch": 0.35903, + "grad_norm": 0.96660447781142, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 35903 + }, + { + "epoch": 0.35904, + "grad_norm": 0.8600038327906341, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 35904 + }, + { + "epoch": 0.35905, + "grad_norm": 0.7845353560503746, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 35905 + }, + { + "epoch": 0.35906, + "grad_norm": 0.6720916719612227, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 35906 + }, + { + "epoch": 0.35907, + "grad_norm": 0.6144870766152345, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 35907 + }, + { + "epoch": 0.35908, + "grad_norm": 0.5876846812451021, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 35908 + }, + { + "epoch": 0.35909, + "grad_norm": 0.6308608548336441, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 35909 + }, + { + "epoch": 0.3591, + "grad_norm": 0.62943823472297, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 35910 + }, + { + "epoch": 0.35911, + "grad_norm": 0.6644537495559605, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 35911 + }, + { + "epoch": 0.35912, + "grad_norm": 0.6264166446795104, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 35912 + }, + { + "epoch": 0.35913, + "grad_norm": 0.6138886970791118, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 35913 + }, + { + "epoch": 0.35914, + "grad_norm": 0.7229476627746637, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 35914 + }, + { + "epoch": 0.35915, + "grad_norm": 0.8577840609265494, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 35915 + }, + { + "epoch": 0.35916, + "grad_norm": 0.809921476185462, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 35916 + }, + { + "epoch": 0.35917, + "grad_norm": 0.8604316911744897, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 35917 + }, + { + "epoch": 0.35918, + "grad_norm": 0.9068764805890928, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 35918 + }, + { + "epoch": 0.35919, + "grad_norm": 0.9199608103285492, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 35919 + }, + { + "epoch": 0.3592, + "grad_norm": 0.8555724367436517, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 35920 + }, + { + "epoch": 0.35921, + "grad_norm": 0.7963735112223053, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 35921 + }, + { + "epoch": 0.35922, + "grad_norm": 0.7501068259878613, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 35922 + }, + { + "epoch": 0.35923, + "grad_norm": 0.6962561149086551, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 35923 + }, + { + "epoch": 0.35924, + "grad_norm": 0.7084535846556932, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 35924 + }, + { + "epoch": 0.35925, + "grad_norm": 0.6912773609407928, + "learning_rate": 0.003, + "loss": 4.038, + "step": 35925 + }, + { + "epoch": 0.35926, + "grad_norm": 0.7374355237727781, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 35926 + }, + { + "epoch": 0.35927, + "grad_norm": 0.8630800860790284, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 35927 + }, + { + "epoch": 0.35928, + "grad_norm": 1.1003344862148627, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 35928 + }, + { + "epoch": 0.35929, + "grad_norm": 1.1512845731859822, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 35929 + }, + { + "epoch": 0.3593, + "grad_norm": 0.7765597474530024, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 35930 + }, + { + "epoch": 0.35931, + "grad_norm": 0.7153068798408476, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 35931 + }, + { + "epoch": 0.35932, + "grad_norm": 0.8150397236262578, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 35932 + }, + { + "epoch": 0.35933, + "grad_norm": 0.8284141375687092, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 35933 + }, + { + "epoch": 0.35934, + "grad_norm": 0.7731974352973713, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 35934 + }, + { + "epoch": 0.35935, + "grad_norm": 0.7049902229889523, + "learning_rate": 0.003, + "loss": 4.0056, + "step": 35935 + }, + { + "epoch": 0.35936, + "grad_norm": 0.7028124229101087, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 35936 + }, + { + "epoch": 0.35937, + "grad_norm": 0.735761832258226, + "learning_rate": 0.003, + "loss": 4.032, + "step": 35937 + }, + { + "epoch": 0.35938, + "grad_norm": 0.8695468151747444, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 35938 + }, + { + "epoch": 0.35939, + "grad_norm": 0.9182114629786018, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 35939 + }, + { + "epoch": 0.3594, + "grad_norm": 0.9592484474695742, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 35940 + }, + { + "epoch": 0.35941, + "grad_norm": 1.0174974604142175, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 35941 + }, + { + "epoch": 0.35942, + "grad_norm": 0.9189207856229025, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 35942 + }, + { + "epoch": 0.35943, + "grad_norm": 0.7582563193506823, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 35943 + }, + { + "epoch": 0.35944, + "grad_norm": 0.7949544930176887, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 35944 + }, + { + "epoch": 0.35945, + "grad_norm": 0.8463110646232557, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 35945 + }, + { + "epoch": 0.35946, + "grad_norm": 0.9854405820681579, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 35946 + }, + { + "epoch": 0.35947, + "grad_norm": 1.1100310605662145, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 35947 + }, + { + "epoch": 0.35948, + "grad_norm": 0.8146795702447497, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 35948 + }, + { + "epoch": 0.35949, + "grad_norm": 0.8078292109636466, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 35949 + }, + { + "epoch": 0.3595, + "grad_norm": 0.8036254626730283, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 35950 + }, + { + "epoch": 0.35951, + "grad_norm": 0.850089158673278, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 35951 + }, + { + "epoch": 0.35952, + "grad_norm": 0.9845374415394457, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 35952 + }, + { + "epoch": 0.35953, + "grad_norm": 0.9822529068123973, + "learning_rate": 0.003, + "loss": 4.049, + "step": 35953 + }, + { + "epoch": 0.35954, + "grad_norm": 0.7769876229293872, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 35954 + }, + { + "epoch": 0.35955, + "grad_norm": 0.7518321331407024, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 35955 + }, + { + "epoch": 0.35956, + "grad_norm": 0.7998332826835556, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 35956 + }, + { + "epoch": 0.35957, + "grad_norm": 0.7441501239077445, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 35957 + }, + { + "epoch": 0.35958, + "grad_norm": 0.7334715441549721, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 35958 + }, + { + "epoch": 0.35959, + "grad_norm": 0.7550967048413534, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 35959 + }, + { + "epoch": 0.3596, + "grad_norm": 0.8731127302711513, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 35960 + }, + { + "epoch": 0.35961, + "grad_norm": 0.9814649973342221, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 35961 + }, + { + "epoch": 0.35962, + "grad_norm": 0.9793736134364558, + "learning_rate": 0.003, + "loss": 4.018, + "step": 35962 + }, + { + "epoch": 0.35963, + "grad_norm": 0.8648079461882516, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 35963 + }, + { + "epoch": 0.35964, + "grad_norm": 0.7648642421468954, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 35964 + }, + { + "epoch": 0.35965, + "grad_norm": 0.8309324924905529, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 35965 + }, + { + "epoch": 0.35966, + "grad_norm": 0.8038760318471242, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 35966 + }, + { + "epoch": 0.35967, + "grad_norm": 0.8498864404603288, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 35967 + }, + { + "epoch": 0.35968, + "grad_norm": 0.949733729900237, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 35968 + }, + { + "epoch": 0.35969, + "grad_norm": 0.9891045457364769, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 35969 + }, + { + "epoch": 0.3597, + "grad_norm": 0.8473434221274576, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 35970 + }, + { + "epoch": 0.35971, + "grad_norm": 0.7529946230716891, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 35971 + }, + { + "epoch": 0.35972, + "grad_norm": 0.8469415156044917, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 35972 + }, + { + "epoch": 0.35973, + "grad_norm": 1.0225938944613675, + "learning_rate": 0.003, + "loss": 4.053, + "step": 35973 + }, + { + "epoch": 0.35974, + "grad_norm": 1.1885124752805654, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 35974 + }, + { + "epoch": 0.35975, + "grad_norm": 0.7377708136575195, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 35975 + }, + { + "epoch": 0.35976, + "grad_norm": 0.6108701028578247, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 35976 + }, + { + "epoch": 0.35977, + "grad_norm": 0.6763047040537055, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 35977 + }, + { + "epoch": 0.35978, + "grad_norm": 0.6577297866494564, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 35978 + }, + { + "epoch": 0.35979, + "grad_norm": 0.6176876476517045, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 35979 + }, + { + "epoch": 0.3598, + "grad_norm": 0.5678339640433867, + "learning_rate": 0.003, + "loss": 3.9983, + "step": 35980 + }, + { + "epoch": 0.35981, + "grad_norm": 0.631091271136768, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 35981 + }, + { + "epoch": 0.35982, + "grad_norm": 0.6508105398155135, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 35982 + }, + { + "epoch": 0.35983, + "grad_norm": 0.6580766819813928, + "learning_rate": 0.003, + "loss": 4.021, + "step": 35983 + }, + { + "epoch": 0.35984, + "grad_norm": 0.6357600424279186, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 35984 + }, + { + "epoch": 0.35985, + "grad_norm": 0.6335636324062313, + "learning_rate": 0.003, + "loss": 3.9899, + "step": 35985 + }, + { + "epoch": 0.35986, + "grad_norm": 0.7525869457763189, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 35986 + }, + { + "epoch": 0.35987, + "grad_norm": 1.025104682863561, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 35987 + }, + { + "epoch": 0.35988, + "grad_norm": 1.0752689506830986, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 35988 + }, + { + "epoch": 0.35989, + "grad_norm": 0.958530850944837, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 35989 + }, + { + "epoch": 0.3599, + "grad_norm": 1.06229076255523, + "learning_rate": 0.003, + "loss": 4.046, + "step": 35990 + }, + { + "epoch": 0.35991, + "grad_norm": 0.8641403115919098, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 35991 + }, + { + "epoch": 0.35992, + "grad_norm": 0.8174529895114744, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 35992 + }, + { + "epoch": 0.35993, + "grad_norm": 0.8002915089135579, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 35993 + }, + { + "epoch": 0.35994, + "grad_norm": 0.8253868223067767, + "learning_rate": 0.003, + "loss": 4.061, + "step": 35994 + }, + { + "epoch": 0.35995, + "grad_norm": 0.9035360802492447, + "learning_rate": 0.003, + "loss": 4.037, + "step": 35995 + }, + { + "epoch": 0.35996, + "grad_norm": 0.9999856265904967, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 35996 + }, + { + "epoch": 0.35997, + "grad_norm": 0.9207725467796689, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 35997 + }, + { + "epoch": 0.35998, + "grad_norm": 0.9007772937057376, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 35998 + }, + { + "epoch": 0.35999, + "grad_norm": 0.9153076250849782, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 35999 + }, + { + "epoch": 0.36, + "grad_norm": 1.0456260840601326, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 36000 + } + ], + "logging_steps": 1, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.427171142795264e+18, + "train_batch_size": 1024, + "trial_name": null, + "trial_params": null +}